
Commit 2ab28f0

Merge branch 'main' into ns/whisper-cli-quantization

2 parents: 6aefc75 + 93777ec

33 files changed: +1310 -198 lines

.github/workflows/dockerfile_sanity.yml

Lines changed: 4 additions & 4 deletions

@@ -5,13 +5,13 @@ on:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
-
+      - 'Dockerfile.ipex'
+
   pull_request:
     branches:
       - main
     paths:
-      - "docker/Dockerfile.intel"
+      - 'Dockerfile.ipex'
 
 jobs:
   build_and_run:
@@ -27,7 +27,7 @@ jobs:
       - name: Build and Run Docker Image
         run: |
           IMAGE_NAME="intel_image:latest"
-          docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+          docker build -f Dockerfile.ipex -t $IMAGE_NAME .
           if [ $? -ne 0 ]; then
             echo "Docker image build failed."
             exit 1

.github/workflows/test_openvino.yml

Lines changed: 4 additions & 3 deletions

@@ -1,6 +1,7 @@
 name: OpenVINO - Test
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - main
@@ -43,12 +44,12 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,openvino-tokenizers,diffusers,tests,tests-openvino] transformers[testing]
+          pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
+        name: Install specific dependencies and versions required for older transformers
         run: |
-          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+          pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
 
       - if: ${{ matrix.test-pattern == '*modeling*' }}
         name: Uninstall NNCF

.github/workflows/test_openvino_full.yml

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ jobs:
           python -m pip install --upgrade pip
           # Install PyTorch CPU to prevent unnecessary downloading/installing of CUDA packages
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[tests,tests-openvino]
+          pip install .[tests]
 
       - name: Install openvino-nightly
         if: ${{ matrix.openvino == 'ov-nightly' }}

.github/workflows/test_openvino_slow.yml

Lines changed: 3 additions & 3 deletions

@@ -42,12 +42,12 @@ jobs:
         run: |
           pip install --upgrade pip
           pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install .[openvino,tests,tests-openvino] transformers[testing]
+          pip install .[openvino,tests] transformers[testing]
           pip uninstall -y nncf
 
       - if: ${{ matrix.transformers-version != 'latest' }}
-        name: Downgrade Transformers and Accelerate
-        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+        name: Install specific dependencies and versions required for older transformers
+        run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*, diffusers==0.30.* transformers_stream_generator
 
       - name: Pip freeze
         run: pip freeze

Dockerfile.ipex

Lines changed: 73 additions & 0 deletions (new file)

ARG PLATFORM=cpu

FROM ubuntu:22.04 as cpu
WORKDIR /usr/src/
RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
    ca-certificates \
    git \
    curl \
    vim \
    build-essential \
    ccache \
    libgoogle-perftools-dev \
    numactl \
    cmake \
    libjpeg-dev \
    pybind11-dev \
    libpng-dev \
    python3 \
    python3-pip \
    && rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

ARG IPEX_VERSION=2.5.0
ARG PYTORCH_VERSION=2.5.1
ARG TORCHVISION_VERSION=0.20.1+cpu
ARG TORCHAUDIO_VERSION=2.5.1+cpu

RUN python3 -m pip install --no-cache-dir \
    torch==${PYTORCH_VERSION}+cpu \
    torchvision==${TORCHVISION_VERSION} \
    torchaudio==${TORCHAUDIO_VERSION} \
    --index-url https://download.pytorch.org/whl/cpu && \
    python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \
    python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
    python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \
    python3 -m pip install --no-cache-dir py-libnuma

ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
ARG KMP_HW_SUBSET=1T
ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"

FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
WORKDIR /usr/src/

RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
    ca-certificates \
    git \
    curl \
    vim \
    ccache \
    libgoogle-perftools-dev \
    numactl \
    libjpeg-dev \
    pybind11-dev \
    libpng-dev \
    && rm -rf /var/lib/apt/lists/*"
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
    | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils

FROM ${PLATFORM}

COPY optimum optimum
COPY Makefile setup.cfg setup.py pyproject.toml README.md ./
RUN pip install .

README.md

Lines changed: 3 additions & 3 deletions

@@ -6,7 +6,7 @@
 
 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.
 
-[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion.
+[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations like faster attention and operators fusion.
 
 Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target.
 
@@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)
 
 
 ## IPEX
-To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model.
+To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint, and apply IPEX operators optimization (replaced with customized IPEX operators).
 ```diff
 from transformers import AutoTokenizer, pipeline
 - from transformers import AutoModelForCausalLM
@@ -168,7 +168,7 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with
 
   model_id = "gpt2"
 - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
   tokenizer = AutoTokenizer.from_pretrained(model_id)
   pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
   results = pipe("He's a dreadful magician and")

docker/Dockerfile.intel

Lines changed: 0 additions & 53 deletions
This file was deleted.

docs/source/ipex/inference.mdx

Lines changed: 2 additions & 1 deletion

@@ -14,7 +14,7 @@ Optimum Intel can be used to load models from the [Hub](https://huggingface.co/m
 
 ## Loading
 
-You can load your model and apply IPEX optimizations (apply torch.compile for non-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
+You can load your model and apply IPEX optimizations (apply torch.compile except text-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
 For now, support is enabled for Intel CPU/GPU. Previous models converted to TorchScript will be deprecated in v1.22.
 
 ```diff
@@ -43,3 +43,4 @@ As shown in the table below, each task is associated with a class enabling to au
 | `IPEXModelForMaskedLM`            | `fill-mask`            |
 | `IPEXModelForAudioClassification` | `audio-classification` |
 | `IPEXModelForCausalLM`            | `text-generation`      |
+| `IPEXModelForSeq2SeqLM`           | `text2text-generation` |
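Taken together with the T5 entry added to the supported-architectures list in models.mdx below, the new `IPEXModelForSeq2SeqLM` row suggests a drop-in usage along these lines. A minimal sketch, assuming a standard `t5-small` checkpoint and the usual `transformers` tokenizer/pipeline API (the model id and prompt are illustrative, not taken from the commit):

```python
# Sketch only: assumes optimum-intel with IPEX support installed.
import torch
from transformers import AutoTokenizer, pipeline

from optimum.intel import IPEXModelForSeq2SeqLM

model_id = "t5-small"  # illustrative choice; any supported seq2seq checkpoint should work
model = IPEXModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# text2text-generation is the task the new class is mapped to in the table above
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
print(pipe("translate English to French: He never came back."))
```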

docs/source/ipex/models.mdx

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@ Here is the list of the supported architectures :
 - Roberta
 - Roformer
 - SqueezeBert
+- T5
 - UniSpeech
 - Vit
 - Wav2Vec2

docs/source/openvino/export.mdx

Lines changed: 7 additions & 4 deletions

@@ -83,14 +83,15 @@ Optional arguments:
   --ratio RATIO         A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
                         quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
                         quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
-                        and inference latency. Default value is 1.0.
+                        and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+                        less than 1.0, then data-aware mixed precision assignment will be applied.
   --sym                 Whether to apply symmetric quantization
   --group-size GROUP_SIZE
                         The group size to use for quantization. Recommended value is 128 and -1 uses per-column
                         quantization.
   --backup-precision {none,int8_sym,int8_asym}
-                        Defines a backup precision for mixed-precision weight compression. Only valid for int4 weight
-                        format. If not provided, backup precision is int8_asym. 'none' stands for original floating-
+                        Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight
+                        formats. If not provided, backup precision is int8_asym. 'none' stands for original floating-
                         point precision of the model weights, in this case weights are retained in their original
                         precision without any quantization. 'int8_sym' stands for 8-bit integer symmetric quantization
                         without zero point. 'int8_asym' stands for 8-bit integer asymmetric quantization with zero
@@ -99,7 +100,9 @@ Optional arguments:
                         can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
                         be collected from model's generations. For diffusion models it should be on of
                         ['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
-                        visual language models the dataset must be set to 'contextual'.
+                        visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+                        compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+                        argument will not have an effect on the resulting model.
   --all-layers          Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an
                         weight compression is applied, they are compressed to INT8.
   --awq                 Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
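The ratio/dataset interaction described in the updated help text can also be expressed through the Python API. A minimal sketch, assuming `OVModelForCausalLM` and `OVWeightQuantizationConfig` from `optimum.intel` (the checkpoint and parameter values are illustrative only, not part of this commit):

```python
# Sketch only: per the updated --ratio help text above, providing a dataset
# together with a ratio below 1.0 is expected to trigger data-aware
# mixed-precision assignment.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    ratio=0.8,            # 80% of layers in int4, the rest in the backup precision
    dataset="wikitext2",  # one of the datasets listed in the --dataset help text
)

model = OVModelForCausalLM.from_pretrained(
    "gpt2",               # illustrative checkpoint
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("gpt2-int4-ov")
```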

notebooks/ipex/text_generation.ipynb

Lines changed: 2 additions & 2 deletions

@@ -11,7 +11,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model."
+    "To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It could apply IPEX, providing optimizations like faster attention and operators fusion."
    ]
   },
   {
@@ -60,7 +60,7 @@
    }
   ],
   "source": [
-    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
+    "model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
    "input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
    "model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",

optimum/commands/export/openvino.py

Lines changed: 15 additions & 6 deletions

@@ -112,7 +112,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
         default=None,
         help=(
             "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
-            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
+            "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
+            "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
         ),
     )
     optional_group.add_argument(
@@ -133,7 +134,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
         choices=["none", "int8_sym", "int8_asym"],
         default=None,
         help=(
-            "Defines a backup precision for mixed-precision weight compression. Only valid for int4 weight format. "
+            "Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight formats. "
             "If not provided, backup precision is int8_asym. 'none' stands for original floating-point precision of "
             "the model weights, in this case weights are retained in their original precision without any "
             "quantization. 'int8_sym' stands for 8-bit integer symmetric quantization without zero point. 'int8_asym' "
@@ -150,7 +151,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
             "dataset will be collected from model's generations. "
             "For diffusion models it should be on of ['conceptual_captions',"
             "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
-            "For visual language models the dataset must be set to 'contextual'."
+            "For visual language models the dataset must be set to 'contextual'. "
+            "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
+            "equals 1.0, the dataset argument will not have an effect on the resulting model."
         ),
     )
     optional_group.add_argument(
@@ -407,6 +410,10 @@ def run(self):
                 from optimum.intel import OVStableDiffusion3Pipeline
 
                 model_cls = OVStableDiffusion3Pipeline
+            elif class_name == "FluxPipeline":
+                from optimum.intel import OVFluxPipeline
+
+                model_cls = OVFluxPipeline
             else:
                 raise NotImplementedError(f"Quantization in hybrid mode isn't supported for class {class_name}.")
 
@@ -415,8 +422,10 @@ def run(self):
             if not self.args.disable_convert_tokenizer:
                 maybe_convert_tokenizers(library_name, self.args.output, model, task=task)
         elif (
-            task.startswith("text-generation") or task in ["automatic-speech-recognition", "image-text-to-text"]
-        ) and quantize_with_dataset:
+            quantize_with_dataset
+            and (task.startswith("text-generation") or task == "automatic-speech-recognition")
+            or (task == "image-text-to-text" and quantization_config is not None)
+        ):
             if task.startswith("text-generation"):
                 from optimum.intel import OVModelForCausalLM
 
@@ -430,7 +439,7 @@ def run(self):
 
                 model_cls = OVModelForSpeechSeq2Seq
 
-            # To quantize a model with a dataset, an instance of a model class is required
+            # In this case, to apply quantization an instance of a model class is required
             model = model_cls.from_pretrained(
                 self.args.model,
                 export=True,
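The reworked branch condition in the second-to-last hunk mixes `and` and `or`, so its evaluation order is worth spelling out. A hedged restatement with explicit parentheses (the function name is hypothetical; only the boolean structure comes from the diff):

```python
from typing import Optional


def should_quantize_via_model_class(
    task: str, quantize_with_dataset: bool, quantization_config: Optional[object]
) -> bool:
    """Restates the condition from the diff above with explicit grouping.

    Python's `and` binds tighter than `or`, so the new condition reads as:
    (a dataset-based quantization was requested for text-generation or
    automatic-speech-recognition) OR (the task is image-text-to-text and a
    quantization config was provided).
    """
    return (
        quantize_with_dataset
        and (task.startswith("text-generation") or task == "automatic-speech-recognition")
    ) or (task == "image-text-to-text" and quantization_config is not None)
```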
