diff --git a/.github/workflows/dockerfile_sanity.yml b/.github/workflows/dockerfile_sanity.yml index 060b80ca45..738be80319 100644 --- a/.github/workflows/dockerfile_sanity.yml +++ b/.github/workflows/dockerfile_sanity.yml @@ -5,13 +5,13 @@ on: branches: - main paths: - - "docker/Dockerfile.intel" - + - 'Dockerfile.ipex' + pull_request: branches: - main paths: - - "docker/Dockerfile.intel" + - 'Dockerfile.ipex' jobs: build_and_run: @@ -27,7 +27,7 @@ jobs: - name: Build and Run Docker Image run: | IMAGE_NAME="intel_image:latest" - docker build -f docker/Dockerfile.intel -t $IMAGE_NAME . + docker build -f Dockerfile.ipex -t $IMAGE_NAME . if [ $? -ne 0 ]; then echo "Docker image build failed." exit 1 diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index 7583c51078..db35324a9e 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -1,6 +1,7 @@ name: OpenVINO - Test on: + workflow_dispatch: push: branches: - main @@ -46,9 +47,9 @@ jobs: pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing] - if: ${{ matrix.transformers-version != 'latest' }} - name: Downgrade Transformers and Accelerate + name: Install specific dependencies and versions required for older transformers run: | - pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* + pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator - if: ${{ matrix.test-pattern == '*modeling*' }} name: Uninstall NNCF diff --git a/.github/workflows/test_openvino_slow.yml b/.github/workflows/test_openvino_slow.yml index 9ad5ef2691..8c3d9b2d3f 100644 --- a/.github/workflows/test_openvino_slow.yml +++ b/.github/workflows/test_openvino_slow.yml @@ -46,8 +46,8 @@ jobs: pip uninstall -y nncf - if: ${{ matrix.transformers-version != 'latest' }} - name: Downgrade Transformers and Accelerate - run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* + name: Install specific dependencies and versions required for older transformers + run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*, diffusers==0.30.* transformers_stream_generator - name: Pip freeze run: pip freeze diff --git a/Dockerfile.ipex b/Dockerfile.ipex new file mode 100644 index 0000000000..a03b1d26a3 --- /dev/null +++ b/Dockerfile.ipex @@ -0,0 +1,73 @@ +ARG PLATFORM=cpu + +FROM ubuntu:22.04 as cpu +WORKDIR /usr/src/ +RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + git \ + curl \ + vim \ + build-essential \ + ccache \ + libgoogle-perftools-dev \ + numactl \ + cmake \ + libjpeg-dev \ + pybind11-dev \ + libpng-dev \ + python3 \ + python3-pip \ + && rm -rf /var/lib/apt/lists/*" +RUN /usr/sbin/update-ccache-symlinks +RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache + +ARG IPEX_VERSION=2.5.0 +ARG PYTORCH_VERSION=2.5.1 +ARG TORCHVISION_VERSION=0.20.1+cpu +ARG TORCHAUDIO_VERSION=2.5.1+cpu + +RUN python3 -m pip install --no-cache-dir \ + torch==${PYTORCH_VERSION}+cpu \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + --index-url https://download.pytorch.org/whl/cpu && \ + python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \ + python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \ + python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \ + python3 -m pip install --no-cache-dir py-libnuma + +ARG KMP_BLOCKTIME=1 +ENV KMP_BLOCKTIME=${KMP_BLOCKTIME} +ARG KMP_HW_SUBSET=1T +ENV KMP_HW_SUBSET=${KMP_HW_SUBSET} +ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so" + +FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu +WORKDIR /usr/src/ + +RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ + sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + git \ + curl \ + vim \ + ccache \ + libgoogle-perftools-dev \ + numactl \ + libjpeg-dev \ + pybind11-dev \ + libpng-dev \ + && rm -rf /var/lib/apt/lists/*" +RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ +| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils + +FROM ${PLATFORM} + +COPY optimum optimum +COPY Makefile setup.cfg setup.py pyproject.toml README.md ./ +RUN pip install . diff --git a/README.md b/README.md index 0cd317c78d..28c5800684 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures. -[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion. +[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations like faster attention and operators fusion. Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target. @@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir) ## IPEX -To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model. +To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint, and apply IPEX operators optimization (replaced with customized IPEX operators). ```diff from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM diff --git a/docker/Dockerfile.intel b/docker/Dockerfile.intel deleted file mode 100644 index ad4ff63e8c..0000000000 --- a/docker/Dockerfile.intel +++ /dev/null @@ -1,53 +0,0 @@ -# syntax = docker/dockerfile:1 -# based onhttps://github.com/pytorch/pytorch/blob/master/Dockerfile -# -# NOTE: To build this you will need a docker version >= 19.03 and DOCKER_BUILDKIT=1 -# -# If you do not use buildkit you are not going to have a good time -# -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ - -ARG BASE_IMAGE=ubuntu:22.04 -FROM ${BASE_IMAGE} - -RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ - sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ - ca-certificates \ - git \ - curl \ - vim \ - build-essential \ - ccache \ - libgoogle-perftools-dev \ - numactl \ - cmake \ - libjpeg-dev \ - pybind11-dev \ - libpng-dev \ - python3 \ - python3-pip \ - && rm -rf /var/lib/apt/lists/*" -RUN /usr/sbin/update-ccache-symlinks -RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache - -ARG IPEX_VERSION=2.3.100 -ARG PYTORCH_VERSION=2.3.1 -ARG TORCHVISION_VERSION=0.18.1+cpu -ARG TORCHAUDIO_VERSION=2.3.1+cpu - -RUN python3 -m pip install --no-cache-dir \ - intel-openmp \ - torch==${PYTORCH_VERSION}+cpu \ - torchvision==${TORCHVISION_VERSION} \ - torchaudio==${TORCHAUDIO_VERSION} \ - -f https://download.pytorch.org/whl/torch_stable.html && \ - python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \ - python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ - python3 -m pip install --no-cache-dir py-libnuma - -ARG KMP_BLOCKTIME=1 -ENV KMP_BLOCKTIME=${KMP_BLOCKTIME} -ARG KMP_HW_SUBSET=1T -ENV KMP_HW_SUBSET=${KMP_HW_SUBSET} -ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so" diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 4876885219..3e7e458c02 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -78,7 +78,8 @@ Optional arguments: --ratio RATIO A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size - and inference latency. Default value is 1.0. + and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is + less than 1.0, then data-aware mixed precision assignment will be applied. --sym Whether to apply symmetric quantization --group-size GROUP_SIZE The group size to use for quantization. Recommended value is 128 and -1 uses per-column @@ -94,7 +95,9 @@ Optional arguments: can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will be collected from model's generations. For diffusion models it should be on of ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For - visual language models the dataset must be set to 'contextual'. + visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware + compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset + argument will not have an effect on the resulting model. --all-layers Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight compression is applied, they are compressed to INT8. --awq Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but diff --git a/notebooks/ipex/text_generation.ipynb b/notebooks/ipex/text_generation.ipynb index d1a62d9201..4c97d5b6b0 100644 --- a/notebooks/ipex/text_generation.ipynb +++ b/notebooks/ipex/text_generation.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model." + "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It could apply IPEX, providing optimizations like faster attention and operators fusion." ] }, { @@ -60,7 +60,7 @@ } ], "source": [ - "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n", + "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n", "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n", "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n", diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 61c21c5c72..6965efcb54 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"): default=None, help=( "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 " - "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0." + "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. " + "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied." ), ) optional_group.add_argument( @@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"): "dataset will be collected from model's generations. " "For diffusion models it should be on of ['conceptual_captions'," "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. " - "For visual language models the dataset must be set to 'contextual'." + "For visual language models the dataset must be set to 'contextual'. " + "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or " + "equals 1.0, the dataset argument will not have an effect on the resulting model." ), ) optional_group.add_argument( diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 169f46076d..ec9a18e04c 100755 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -207,7 +207,7 @@ def _llama_model_forward( position_ids = torch.arange( past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device ) - position_ids = position_ids.unsqueeze(0) + position_ids = position_ids.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0) if inputs_embeds is None: inputs_embeds = self.embed_tokens(input_ids) @@ -324,7 +324,7 @@ def _falcon_model_forward( ) if position_ids is None: - position_ids = cache_position.unsqueeze(0) + position_ids = cache_position.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -446,7 +446,7 @@ def _gpt2_model_forward( past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 if position_ids is None: position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0) + position_ids = position_ids.unsqueeze(0).repeat_interleave(input_ids.shape[0], 0) if inputs_embeds is None: inputs_embeds = self.wte(input_ids) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 128643eb1d..592cd85a4b 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -474,9 +474,6 @@ class StoreAttr(object): from optimum.intel.openvino.quantization import _weight_only_quantization _weight_only_quantization(submodel, quantization_config) - if "text-generation" in task: - submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"]) - compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml" save_model(submodel, compressed_submodel_path, compress_to_fp16=False) del submodel diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index d9c0165d98..1ffcabb48b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -1804,8 +1804,9 @@ def __init__( normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], - width: int = DEFAULT_DUMMY_SHAPES["width"], - height: int = DEFAULT_DUMMY_SHAPES["height"], + width: int = DEFAULT_DUMMY_SHAPES["width"] // 4, + height: int = DEFAULT_DUMMY_SHAPES["height"] // 4, + # Reduce img shape by 4 for FLUX to reduce memory usage on conversion **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 7b0ae0f47f..ad9fdca078 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -51,6 +51,7 @@ "IPEXModel", ] else: + _import_structure["utils.dummy_ipex_objects"] = [] _import_structure["ipex"] = [ "IPEXModelForCausalLM", "IPEXModelForSeq2SeqLM", diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index bf2be43e52..01df5d85e6 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -62,7 +62,7 @@ _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation") _IPEX_MINIMUM_VERSION_FOR_COMPILE = "2.5.0" # TODO: Some models are already fixed in torch 2.6, will enable them when torch upgrading to 2.6 -_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "beit", "llama", "falcon", "gpt2") +_COMPILE_NOT_READY_MODEL_TYPES = ("electra", "roformer", "gpt_neox", "beit", "llama", "falcon", "gpt2") def _is_patched_with_ipex(model, task, use_cache: bool = True): @@ -291,6 +291,8 @@ def forward( attention_mask: Optional[torch.FloatTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: + if self.add_patch and input_ids is not None and attention_mask is None: + attention_mask = torch.ones_like(input_ids) return self.model(input_ids=input_ids, attention_mask=attention_mask, **kwargs) def _prepare_generation_config( @@ -298,7 +300,7 @@ def _prepare_generation_config( ) -> Tuple[GenerationConfig, Dict]: generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs) generation_method = generation_config.get_generation_mode().value - if self.compiled and generation_config.cache_implementation != "ipex_paged": + if self.compiled and generation_config.cache_implementation != "ipex_paged" and self._supports_static_cache: # Use static cache for torch compile generation_config.cache_implementation = "static" if generation_method not in _IPEX_EXPORTED_GENERATION_METHODS: diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index a0fc68361c..4fdfe368a2 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -344,6 +344,8 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): ratio (`float`, defaults to 1.0): The ratio between baseline and backup precisions (e.g. 0.9 means 90% of layers quantized to INT4_ASYM and the rest to INT8_ASYM). + Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment + will be applied. all_layers (`bool`, *optional*): Defines how many layers are compressed to 4-bits while the rest are kept in 8-bit precision. sensitivity_metric (`str`, *optional*): @@ -441,7 +443,7 @@ def post_init(self): Safety checker that arguments are correct """ super().post_init() - if self.ratio is not None and not (0 <= self.ratio <= 1): + if not (0 <= self.ratio <= 1): raise ValueError("`ratio` must between 0 and 1.") if self.group_size is not None and self.group_size != -1 and self.group_size <= 0: raise ValueError("`group_size` must be greater than 0 or equal to -1") @@ -461,6 +463,18 @@ def post_init(self): or {stable_diffusion_datasets} for diffusion models, but we found {self.dataset}""" ) + if self.dataset is not None and not ( + self.quant_method == OVQuantizationMethod.AWQ + or self.scale_estimation + or self.gptq + or self.lora_correction + or (self.ratio < 1.0 and self.sensitivity_metric != nncf.SensitivityMetric.WEIGHT_QUANTIZATION_ERROR) + ): + logger.warning( + "The provided dataset won't have any effect on the resulting compressed model because no data-aware " + "quantization algorithm is selected and compression ratio is 1.0." + ) + if self.bits not in [4, 8]: raise ValueError(f"Only support quantization to [4,8] bits but found {self.bits}") diff --git a/setup.py b/setup.py index ca415fca35..d9b3b8642b 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ "nncf": ["nncf>=2.14.0"], "openvino": ["nncf>=2.14.0", "openvino>=2024.5.0", "openvino-tokenizers>=2024.5.0"], "neural-compressor": ["neural-compressor[pt]>3.0", "accelerate", "transformers<4.46"], - "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47"], + "ipex": ["intel-extension-for-pytorch>=2.4", "transformers>4.45,<4.47", "accelerate"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 3c4b652c18..e3584bb112 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -49,7 +49,7 @@ IPEXSentenceTransformer, ) from optimum.utils.testing_utils import grid_parameters, require_sentence_transformers -from optimum.intel.utils.import_utils import is_sentence_transformers_available +from optimum.intel.utils.import_utils import is_sentence_transformers_available, is_torch_version if is_sentence_transformers_available(): from sentence_transformers import SentenceTransformer @@ -319,7 +319,6 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 - # Test model forward do not need cache. ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) self.assertIsInstance(ipex_model.config, PretrainedConfig) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -353,6 +352,38 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_forward(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 + ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) + self.assertIsInstance(ipex_model.config, PretrainedConfig) + input_ids = torch.Tensor([[1, 2, 3], [4, 5, 6]]).to(torch.long) + outputs = ipex_model(input_ids) + + self.assertIsInstance(outputs.logits, torch.Tensor) + + transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) + with torch.no_grad(): + transformers_outputs = transformers_model(input_ids) + + # Test re-load model + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname, torch_dtype=dtype, device_map=DEVICE) + loaded_model_outputs = loaded_model(input_ids) + + # Test init method + init_model = self.IPEX_MODEL_CLASS(transformers_model) + init_model_outputs = init_model(input_ids) + + # Compare tensor outputs + self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + # To avoid float pointing error + self.assertTrue(torch.allclose(outputs.logits, loaded_model_outputs.logits, atol=1e-7)) + self.assertTrue(torch.allclose(outputs.logits, init_model_outputs.logits, atol=1e-7)) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): dtype = torch.float16 if IS_XPU_AVAILABLE else torch.float32 @@ -407,6 +438,9 @@ def test_ipex_beam_search(self, test_name, model_arch, use_cache): model = IPEXModelForCausalLM.from_pretrained( model_id, use_cache=use_cache, torch_dtype=dtype, device_map=DEVICE ) + # It will be removed when torch 2.6 released + if model_arch == "opt" and not use_cache and model.compiled and is_torch_version("<", "2.6.0"): + return if use_cache and model_arch in self.IPEX_PATCHED_SUPPORTED_ARCHITECTURES: self.assertTrue(model.add_patch) transformers_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, device_map=DEVICE) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 1fd58646e7..d02dea3f13 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -106,8 +106,8 @@ class OVQuantizerTest(unittest.TestCase): weight_only=False, smooth_quant_alpha=0.95, ), - (14, 22, 21) if is_transformers_version("<=", "4.36.0") else (14, 22, 25), - (14, 21, 17) if is_transformers_version("<=", "4.36.0") else (14, 22, 18), + (14, 22, 21) if is_transformers_version("<=", "4.42.4") else (14, 22, 25), + (14, 21, 17) if is_transformers_version("<=", "4.42.4") else (14, 22, 18), ), ]