
Commit 2ab28f0

Merge branch 'main' into ns/whisper-cli-quantization

2 parents: 6aefc75 + 93777ec

33 files changed: +1310 -198 lines

.github/workflows/dockerfile_sanity.yml

Lines changed: 4 additions & 4 deletions

@@ -5,13 +5,13 @@ on:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
-
+      - 'Dockerfile.ipex'
+
   pull_request:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
+      - 'Dockerfile.ipex'
 
 jobs:
   build_and_run:
@@ -27,7 +27,7 @@ jobs:
       - name: Build and Run Docker Image
         run: |
           IMAGE_NAME="intel_image:latest"
-          docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+          docker build -f Dockerfile.ipex -t $IMAGE_NAME .
           if [ $? -ne 0 ]; then
             echo "Docker image build failed."
             exit 1

.github/workflows/test_openvino.yml

Lines changed: 4 additions & 3 deletions

@@ -1,6 +1,7 @@
 name: OpenVINO - Test
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - main
@@ -43,12 +44,12 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,openvino-tokenizers,diffusers,tests,tests-openvino] transformers[testing]
+          pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
+        name: Install specific dependencies and versions required for older transformers
         run: |
-          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
       - if: ${{ matrix.test-pattern == '*modeling*' }}
         name: Uninstall NNCF

.github/workflows/test_openvino_full.yml

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ jobs:
           python -m pip install --upgrade pip
           # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[tests,tests-openvino]
+          pip install .[tests]
 
       - name: Install openvino-nightly
         if: ${{ matrix.openvino == 'ov-nightly' }}

.github/workflows/test_openvino_slow.yml

Lines changed: 3 additions & 3 deletions

@@ -42,12 +42,12 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,tests,tests-openvino] transformers[testing]
+          pip install .[openvino,tests] transformers[testing]
           pip uninstall -y nncf
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
-        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+        name: Install specific dependencies and versions required for older transformers
+        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*, diffusers==0.30.* transformers_stream_generator
 
       - name: Pip freeze
         run: pip freeze

Dockerfile.ipex

Lines changed: 73 additions & 0 deletions (new file)

ARG PLATFORM=cpu

FROM ubuntu:22.04 as cpu
WORKDIR /usr/src/
RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
    ca-certificates \
    git \
    curl \
    vim \
    build-essential \
    ccache \
    libgoogle-perftools-dev \
    numactl \
    cmake \
    libjpeg-dev \
    pybind11-dev \
    libpng-dev \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

ARG IPEX_VERSION=2.5.0
ARG PYTORCH_VERSION=2.5.1
ARG TORCHVISION_VERSION=0.20.1+cpu
ARG TORCHAUDIO_VERSION=2.5.1+cpu

RUN python3 -m pip install --no-cache-dir \
    torch==${PYTORCH_VERSION}+cpu \
    torchvision==${TORCHVISION_VERSION} \
    torchaudio==${TORCHAUDIO_VERSION} \
    --index-url https://download.pytorch.org/whl/cpu && \
    python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \
    python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \
    python3 -m pip install --no-cache-dir py-libnuma

ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
ARG KMP_HW_SUBSET=1T
ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"

FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
WORKDIR /usr/src/

RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
    ca-certificates \
    git \
    curl \
    vim \
    ccache \
    libgoogle-perftools-dev \
    numactl \
    libjpeg-dev \
    pybind11-dev \
    libpng-dev \
    && rm -rf /var/lib/apt/lists/*"
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils

FROM ${PLATFORM}

COPY optimum optimum
COPY Makefile setup.cfg setup.py pyproject.toml README.md ./
RUN pip install .

README.md

Lines changed: 3 additions & 3 deletions

@@ -6,7 +6,7 @@
 
 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.
 
-[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion.
+[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations like faster attention and operators fusion.
 
 Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target.
 
@@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 
 
 ## IPEX
-To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model.
+To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint, and apply IPEX operators optimization (replaced with customized IPEX operators).
 ```diff
 from transformers import AutoTokenizer, pipeline
 - from transformers import AutoModelForCausalLM
@@ -168,7 +168,7 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with
 
   model_id = "gpt2"
 - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
   tokenizer = AutoTokenizer.from_pretrained(model_id)
   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
   results = pipe("He's a dreadful magician and")

docker/Dockerfile.intel

Lines changed: 0 additions & 53 deletions
This file was deleted.

docs/source/ipex/inference.mdx

Lines changed: 2 additions & 1 deletion

@@ -14,7 +14,7 @@ Optimum Intel can be used to load models from the [Hub](https://huggingface.co/m
 
 ## Loading
 
-You can load your model and apply IPEX optimizations (apply torch.compile for non-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
+You can load your model and apply IPEX optimizations (apply torch.compile except text-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
 For now, support is enabled for Intel CPU/GPU. Previous models converted to TorchScript will be deprecated in v1.22.
 
 ```diff
@@ -43,3 +43,4 @@ As shown in the table below, each task is associated with a class enabling to au
 | `IPEXModelForMaskedLM`            | `fill-mask`            |
 | `IPEXModelForAudioClassification` | `audio-classification` |
 | `IPEXModelForCausalLM`            | `text-generation`      |
+| `IPEXModelForSeq2SeqLM`           | `text2text-generation` |
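Taken together with the T5 entry added to the supported-architectures list in models.mdx below, the new `IPEXModelForSeq2SeqLM` row suggests a drop-in usage along these lines. A minimal sketch, assuming a standard `t5-small` checkpoint and the usual `transformers` tokenizer/pipeline API (the model id and prompt are illustrative, not taken from the commit):

```python
# Sketch only: assumes optimum-intel with IPEX support installed.
import torch
from transformers import AutoTokenizer, pipeline

from optimum.intel import IPEXModelForSeq2SeqLM

model_id = "t5-small"  # illustrative choice; any supported seq2seq checkpoint should work
model = IPEXModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# text2text-generation is the task the new class is mapped to in the table above
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
print(pipe("translate English to French: He never came back."))
```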

docs/source/ipex/models.mdx

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@ Here is the list of the supported architectures :
 - Roberta
 - Roformer
 - SqueezeBert
+- T5
 - UniSpeech
 - Vit
 - Wav2Vec2

docs/source/openvino/export.mdx

Lines changed: 7 additions & 4 deletions

@@ -83,14 +83,15 @@ Optional arguments:
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
                         quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
                         quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-                        and inference latency. Default value is 1.0.
+                        and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+                        less than 1.0, then data-aware mixed precision assignment will be applied.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
                         quantization.
   --backup-precision {none,int8_sym,int8_asym}
-                        Defines a backup precision for mixed-precision weight compression. Only valid for int4 weight
-                        format. If not provided, backup precision is int8_asym. 'none' stands for original floating-
+                        Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight
+                        formats. If not provided, backup precision is int8_asym. 'none' stands for original floating-
                         point precision of the model weights, in this case weights are retained in their original
                         precision without any quantization. 'int8_sym' stands for 8-bit integer symmetric quantization
                         without zero point. 'int8_asym' stands for 8-bit integer asymmetric quantization with zero
@@ -99,7 +100,9 @@ Optional arguments:
                         can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
                         be collected from model's generations. For diffusion models it should be on of
                         ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-                        visual language models the dataset must be set to 'contextual'.
+                        visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+                        compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+                        argument will not have an effect on the resulting model.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
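The ratio/dataset interaction described in the updated help text can also be expressed through the Python API. A minimal sketch, assuming `OVModelForCausalLM` and `OVWeightQuantizationConfig` from `optimum.intel` (the checkpoint and parameter values are illustrative only, not part of this commit):

```python
# Sketch only: per the updated --ratio help text above, providing a dataset
# together with a ratio below 1.0 is expected to trigger data-aware
# mixed-precision assignment.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    ratio=0.8,            # 80% of layers in int4, the rest in the backup precision
    dataset="wikitext2",  # one of the datasets listed in the --dataset help text
)

model = OVModelForCausalLM.from_pretrained(
    "gpt2",               # illustrative checkpoint
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("gpt2-int4-ov")
```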

notebooks/ipex/text_generation.ipynb

Lines changed: 2 additions & 2 deletions

@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model."
+    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It could apply IPEX, providing optimizations like faster attention and operators fusion."
    ]
   },
   {
@@ -60,7 +60,7 @@
    }
   ],
   "source": [
-    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
+    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
    "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
    "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",

optimum/commands/export/openvino.py

Lines changed: 15 additions & 6 deletions

@@ -112,7 +112,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -133,7 +134,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
         choices=["none", "int8_sym", "int8_asym"],
         default=None,
         help=(
-            "Defines a backup precision for mixed-precision weight compression. Only valid for int4 weight format. "
+            "Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight formats. "
             "If not provided, backup precision is int8_asym. 'none' stands for original floating-point precision of "
             "the model weights, in this case weights are retained in their original precision without any "
             "quantization. 'int8_sym' stands for 8-bit integer symmetric quantization without zero point. 'int8_asym' "
@@ -150,7 +151,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be on of ['conceptual_captions',"
             "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(
@@ -407,6 +410,10 @@ def run(self):
                 from optimum.intel import OVStableDiffusion3Pipeline
 
                 model_cls = OVStableDiffusion3Pipeline
+            elif class_name == "FluxPipeline":
+                from optimum.intel import OVFluxPipeline
+
+                model_cls = OVFluxPipeline
             else:
                 raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
 
@@ -415,8 +422,10 @@ def run(self):
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
         elif (
-            task.startswith("text-generation") or task in ["automatic-speech-recognition", "image-text-to-text"]
-        ) and quantize_with_dataset:
+            quantize_with_dataset
+            and (task.startswith("text-generation") or task == "automatic-speech-recognition")
+            or (task == "image-text-to-text" and quantization_config is not None)
+        ):
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM
 
@@ -430,7 +439,7 @@ def run(self):
 
                 model_cls = OVModelForSpeechSeq2Seq
 
-            # To quantize a model with a dataset, an instance of a model class is required
+            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
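The reworked branch condition in the second-to-last hunk mixes `and` and `or`, so its evaluation order is worth spelling out. A hedged restatement with explicit parentheses (the function name is hypothetical; only the boolean structure comes from the diff):

```python
from typing import Optional


def should_quantize_via_model_class(
    task: str, quantize_with_dataset: bool, quantization_config: Optional[object]
) -> bool:
    """Restates the condition from the diff above with explicit grouping.

    Python's `and` binds tighter than `or`, so the new condition reads as:
    (a dataset-based quantization was requested for text-generation or
    automatic-speech-recognition) OR (the task is image-text-to-text and a
    quantization config was provided).
    """
    return (
        quantize_with_dataset
        and (task.startswith("text-generation") or task == "automatic-speech-recognition")
    ) or (task == "image-text-to-text" and quantization_config is not None)
```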
