Commit 7e20b86: Merge branch 'main' into varlen

jiqing-feng authored Dec 18, 2024
2 parents 3fdb3a5 + a76be08
Showing 24 changed files with 481 additions and 111 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/dockerfile_sanity.yml
@@ -5,13 +5,13 @@ on:
branches:
- main
paths:
- "docker/Dockerfile.intel"

- 'Dockerfile.ipex'
pull_request:
branches:
- main
paths:
- "docker/Dockerfile.intel"
- 'Dockerfile.ipex'

jobs:
build_and_run:
@@ -27,7 +27,7 @@ jobs:
- name: Build and Run Docker Image
run: |
IMAGE_NAME="intel_image:latest"
- docker build -f docker/Dockerfile.intel -t $IMAGE_NAME .
+ docker build -f Dockerfile.ipex -t $IMAGE_NAME .
if [ $? -ne 0 ]; then
echo "Docker image build failed."
exit 1
5 changes: 3 additions & 2 deletions .github/workflows/test_openvino.yml
@@ -1,6 +1,7 @@
name: OpenVINO - Test

on:
workflow_dispatch:
push:
branches:
- main
@@ -46,9 +47,9 @@ jobs:
pip install .[openvino,openvino-tokenizers,diffusers,tests] transformers[testing]
- if: ${{ matrix.transformers-version != 'latest' }}
- name: Downgrade Transformers and Accelerate
+ name: Install specific dependencies and versions required for older transformers
run: |
- pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+ pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator
- if: ${{ matrix.test-pattern == '*modeling*' }}
name: Uninstall NNCF
4 changes: 2 additions & 2 deletions .github/workflows/test_openvino_slow.yml
@@ -46,8 +46,8 @@ jobs:
pip uninstall -y nncf
- if: ${{ matrix.transformers-version != 'latest' }}
- name: Downgrade Transformers and Accelerate
- run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.*
+ name: Install specific dependencies and versions required for older transformers
+ run: pip install transformers==${{ matrix.transformers-version }} accelerate==0.* peft==0.13.* diffusers==0.30.* transformers_stream_generator

- name: Pip freeze
run: pip freeze
73 changes: 73 additions & 0 deletions Dockerfile.ipex
@@ -0,0 +1,73 @@
ARG PLATFORM=cpu

FROM ubuntu:22.04 as cpu
WORKDIR /usr/src/
RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
ca-certificates \
git \
curl \
vim \
build-essential \
ccache \
libgoogle-perftools-dev \
numactl \
cmake \
libjpeg-dev \
pybind11-dev \
libpng-dev \
python3 \
python3-pip \
&& rm -rf /var/lib/apt/lists/*"
RUN /usr/sbin/update-ccache-symlinks
RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache

ARG IPEX_VERSION=2.5.0
ARG PYTORCH_VERSION=2.5.1
ARG TORCHVISION_VERSION=0.20.1+cpu
ARG TORCHAUDIO_VERSION=2.5.1+cpu

RUN python3 -m pip install --no-cache-dir \
torch==${PYTORCH_VERSION}+cpu \
torchvision==${TORCHVISION_VERSION} \
torchaudio==${TORCHAUDIO_VERSION} \
--index-url https://download.pytorch.org/whl/cpu && \
python3 -m pip install intel-openmp -f https://download.pytorch.org/whl/torch_stable.html && \
python3 -m pip install intel-extension-for-pytorch==$IPEX_VERSION && \
python3 -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ && \
python3 -m pip install --no-cache-dir py-libnuma

ARG KMP_BLOCKTIME=1
ENV KMP_BLOCKTIME=${KMP_BLOCKTIME}
ARG KMP_HW_SUBSET=1T
ENV KMP_HW_SUBSET=${KMP_HW_SUBSET}
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc.so"

FROM intel/intel-extension-for-pytorch:2.3.110-xpu as xpu
WORKDIR /usr/src/

RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
ca-certificates \
git \
curl \
vim \
ccache \
libgoogle-perftools-dev \
numactl \
libjpeg-dev \
pybind11-dev \
libpng-dev \
&& rm -rf /var/lib/apt/lists/*"
RUN wget -qO - https://repositories.intel.com/gpu/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt install -y intel-basekit xpu-smi cmake ninja-build pciutils

FROM ${PLATFORM}

COPY optimum optimum
COPY Makefile setup.cfg setup.py pyproject.toml README.md ./
RUN pip install .
6 changes: 3 additions & 3 deletions README.md
@@ -6,7 +6,7 @@

🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures.

- [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion.
+ [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations like faster attention and operators fusion.

Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target.

@@ -159,7 +159,7 @@ optimized_model = OVModelForSequenceClassification.from_pretrained(save_dir)


## IPEX
- To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model.
+ To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It will load a PyTorch checkpoint, and apply IPEX operators optimization (replaced with customized IPEX operators).
```diff
from transformers import AutoTokenizer, pipeline
- from transformers import AutoModelForCausalLM
@@ -168,7 +168,7 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with

model_id = "gpt2"
- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
- + model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
+ + model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")
53 changes: 0 additions & 53 deletions docker/Dockerfile.intel

This file was deleted.

3 changes: 2 additions & 1 deletion docs/source/ipex/inference.mdx
@@ -14,7 +14,7 @@ Optimum Intel can be used to load models from the [Hub](https://huggingface.co/m

## Loading

- You can load your model and apply IPEX optimizations (apply torch.compile for non-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
+ You can load your model and apply IPEX optimizations (apply torch.compile except text-generation tasks). For supported architectures like LLaMA, BERT and ViT, further optimizations will be applied by patching the model to use custom operators.
For now, support is enabled for Intel CPU/GPU. Previous models converted to TorchScript will be deprecated in v1.22.
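Before the diff example below, here is a rough, self-contained sketch of that non-generation path, assuming a standard classification checkpoint; the model id and input sentence are illustrative assumptions, not part of this change.

```python
# Rough sketch of the non-generation path described above: IPEX optimizations
# (including torch.compile) are applied when the model is loaded.
# Checkpoint id and example sentence are illustrative assumptions.
from transformers import AutoTokenizer, pipeline

from optimum.intel import IPEXModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"  # assumed example checkpoint
model = IPEXModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("This release makes IPEX loading much simpler."))
```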

```diff
@@ -43,3 +43,4 @@ As shown in the table below, each task is associated with a class enabling to au
| `IPEXModelForMaskedLM` | `fill-mask` |
| `IPEXModelForAudioClassification` | `audio-classification` |
| `IPEXModelForCausalLM` | `text-generation` |
| `IPEXModelForSeq2SeqLM` | `text2text-generation` |
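For the newly added `text2text-generation` entry, a minimal sketch of how `IPEXModelForSeq2SeqLM` could be used; the checkpoint, prompt, and use of `pipeline` are illustrative assumptions rather than part of this commit.

```python
# Minimal sketch: text2text-generation with the newly added IPEXModelForSeq2SeqLM.
# The checkpoint id and prompt below are illustrative assumptions.
import torch
from transformers import AutoTokenizer, pipeline

from optimum.intel import IPEXModelForSeq2SeqLM

model_id = "google/flan-t5-small"  # assumed example checkpoint
model = IPEXModelForSeq2SeqLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
print(pipe("Translate English to German: The house is wonderful."))
```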
1 change: 1 addition & 0 deletions docs/source/ipex/models.mdx
@@ -40,6 +40,7 @@ Here is the list of the supported architectures :
- Roberta
- Roformer
- SqueezeBert
- T5
- UniSpeech
- Vit
- Wav2Vec2
7 changes: 5 additions & 2 deletions docs/source/openvino/export.mdx
@@ -78,7 +78,8 @@ Optional arguments:
--ratio RATIO A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit
quantization. If set to 0.8, 80% of the layers will be quantized to int4 while 20% will be
quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size
- and inference latency. Default value is 1.0.
+ and inference latency. Default value is 1.0. Note: If dataset is provided, and the ratio is
+ less than 1.0, then data-aware mixed precision assignment will be applied.
--sym Whether to apply symmetric quantization
--group-size GROUP_SIZE
The group size to use for quantization. Recommended value is 128 and -1 uses per-column
@@ -94,7 +95,9 @@ Optional arguments:
can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the dataset will
be collected from model's generations. For diffusion models it should be one of
['conceptual_captions','laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. For
- visual language models the dataset must be set to 'contextual'.
+ visual language models the dataset must be set to 'contextual'. Note: if none of the data-aware
+ compression algorithms are selected and ratio parameter is omitted or equals 1.0, the dataset
+ argument will not have an effect on the resulting model.
--all-layers Whether embeddings and last MatMul layers should be compressed to INT4. If not provided and
weight compression is applied, they are compressed to INT8.
--awq Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs, but
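For context, the data-aware behaviour described in these options (a `--ratio` below 1.0 combined with a `--dataset`) has a Python-API counterpart; the following is a rough sketch assuming the `OVWeightQuantizationConfig` and `OVModelForCausalLM` interface, with an illustrative checkpoint.

```python
# Sketch of the Python-API counterpart to the CLI options discussed above
# (--ratio, --group-size, --dataset). The model id is an illustrative assumption.
from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

quantization_config = OVWeightQuantizationConfig(
    bits=4,
    ratio=0.8,            # 80% of layers in int4, the rest in int8
    group_size=128,
    dataset="wikitext2",  # with ratio < 1.0, enables data-aware mixed precision assignment
)

model = OVModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM-135M",  # assumed example checkpoint
    export=True,
    quantization_config=quantization_config,
)
model.save_pretrained("smollm-135m-int4-ov")
```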
4 changes: 2 additions & 2 deletions notebooks/ipex/text_generation.ipynb
@@ -11,7 +11,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model."
"To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. It could apply IPEX, providing optimizations like faster attention and operators fusion."
]
},
{
@@ -60,7 +60,7 @@
}
],
"source": [
"model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16, export=True)\n",
"model = IPEXModelForCausalLM.from_pretrained(\"gpt2\", torch_dtype=torch.bfloat16)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n",
"input_sentence = [\"Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?\"]\n",
"model_inputs = tokenizer(input_sentence, return_tensors=\"pt\")\n",
7 changes: 5 additions & 2 deletions optimum/commands/export/openvino.py
@@ -102,7 +102,8 @@ def parse_args_openvino(parser: "ArgumentParser"):
default=None,
help=(
"A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 "
"while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0."
"while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. "
"Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied."
),
)
optional_group.add_argument(
@@ -140,7 +141,9 @@ def parse_args_openvino(parser: "ArgumentParser"):
"dataset will be collected from model's generations. "
"For diffusion models it should be on of ['conceptual_captions',"
"'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. "
"For visual language models the dataset must be set to 'contextual'."
"For visual language models the dataset must be set to 'contextual'. "
"Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or "
"equals 1.0, the dataset argument will not have an effect on the resulting model."
),
)
optional_group.add_argument(
3 changes: 0 additions & 3 deletions optimum/exporters/openvino/__main__.py
@@ -474,9 +474,6 @@ class StoreAttr(object):
from optimum.intel.openvino.quantization import _weight_only_quantization

_weight_only_quantization(submodel, quantization_config)
if "text-generation" in task:
submodel.set_rt_info("u8", ["runtime_options", "KV_CACHE_PRECISION"])

compressed_submodel_path = submodel_path.parent / f"{submodel_path.stem}_compressed.xml"
save_model(submodel, compressed_submodel_path, compress_to_fp16=False)
del submodel
5 changes: 3 additions & 2 deletions optimum/exporters/openvino/model_configs.py
@@ -1804,8 +1804,9 @@ def __init__(
normalized_config: NormalizedVisionConfig,
batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
width: int = DEFAULT_DUMMY_SHAPES["width"],
height: int = DEFAULT_DUMMY_SHAPES["height"],
width: int = DEFAULT_DUMMY_SHAPES["width"] // 4,
height: int = DEFAULT_DUMMY_SHAPES["height"] // 4,
# Reduce img shape by 4 for FLUX to reduce memory usage on conversion
**kwargs,
):
super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
3 changes: 3 additions & 0 deletions optimum/intel/__init__.py
@@ -51,8 +51,10 @@
"IPEXModel",
]
else:
_import_structure["utils.dummy_ipex_objects"] = []
_import_structure["ipex"] = [
"IPEXModelForCausalLM",
"IPEXModelForSeq2SeqLM",
"IPEXModelForSequenceClassification",
"IPEXModelForMaskedLM",
"IPEXModelForTokenClassification",
@@ -247,6 +249,7 @@
IPEXModelForImageClassification,
IPEXModelForMaskedLM,
IPEXModelForQuestionAnswering,
IPEXModelForSeq2SeqLM,
IPEXModelForSequenceClassification,
IPEXModelForTokenClassification,
)
1 change: 1 addition & 0 deletions optimum/intel/ipex/__init__.py
@@ -20,6 +20,7 @@
IPEXModelForImageClassification,
IPEXModelForMaskedLM,
IPEXModelForQuestionAnswering,
IPEXModelForSeq2SeqLM,
IPEXModelForSequenceClassification,
IPEXModelForTokenClassification,
)
0 comments on commit 7e20b86