diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml deleted file mode 100644 index 768c348c7a..0000000000 --- a/.github/workflows/delete_doc_comment.yml +++ /dev/null @@ -1,18 +0,0 @@ -name: Delete PR documentation - -on: - workflow_run: - workflows: ["Delete doc comment trigger"] - types: - - completed - paths: - - "optimum/**.py" - - "docs/**" - - ".github/workflows/build_pr_documentation.yml" - - ".github/workflows/delete_doc_comment.yml" - -jobs: - delete: - uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main - secrets: - comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/delete_doc_comment_trigger.yml b/.github/workflows/delete_doc_comment_trigger.yml deleted file mode 100644 index f87d9bd4dc..0000000000 --- a/.github/workflows/delete_doc_comment_trigger.yml +++ /dev/null @@ -1,12 +0,0 @@ -name: Delete doc comment trigger - -on: - pull_request: - types: [ closed ] - - -jobs: - delete: - uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main - with: - pr_number: ${{ github.event.number }} diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index fd5fd16509..3a15214f99 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -30,7 +30,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install .[neural-compressor,ipex,diffusers,tests] + pip install .[neural-compressor,diffusers,tests] + pip install intel-extension-for-pytorch - name: Test with Pytest run: | pytest tests/neural_compressor/ diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index cb58f412a6..d43cabe323 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -36,3 +36,9 @@ jobs: - name: Test with Pytest run: | pytest tests/openvino/ --ignore test_modeling_basic + - name: Test openvino-nightly import + run: | + pip uninstall -y openvino + pip install openvino-nightly + python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" + diff --git a/README.md b/README.md index e06f91ef17..54d8371b5b 100644 --- a/README.md +++ b/README.md @@ -67,26 +67,52 @@ For more details on the supported compression techniques, please refer to the [d Below are the examples of how to use OpenVINO and its [NNCF](https://docs.openvino.ai/latest/tmo_introduction.html) framework to accelerate inference. +#### Export: + +It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI : + +```plain +optimum-cli export openvino --model gpt2 ov_model +``` + +If you add `--int8`, the model linear and embedding weights will be quantized to INT8, the activations will be kept in floating point precision. + +```plain +optimum-cli export openvino --model gpt2 --int8 ov_model +``` + +To apply quantization on both weights and activations, you can find more information in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov). + #### Inference: To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. -If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR. 
+ ```diff -- from transformers import AutoModelForSequenceClassification -+ from optimum.intel import OVModelForSequenceClassification +- from transformers import AutoModelForSeq2SeqLM ++ from optimum.intel import OVModelForSeq2SeqLM from transformers import AutoTokenizer, pipeline - model_id = "distilbert-base-uncased-finetuned-sst-2-english" -- model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) + model_id = "echarlaix/t5-small-openvino" +- model = AutoModelForSeq2SeqLM.from_pretrained(model_id) ++ model = OVModelForSeq2SeqLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - model.save_pretrained("./distilbert") + pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + results = pipe("He never went out without a book under his arm, and he often came back with two.") - classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) - results = classifier("He's a dreadful magician.") + [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` +If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR. + +```python +from optimum.intel import OVModelForCausalLM + +model = OVModelForCausalLM.from_pretrained("gpt2", export=True) +model.save_pretrained("./ov_model") +``` + + #### Post-training static quantization: Post-training static quantization introduces an additional calibration step where data is fed through the network in order to compute the activations quantization parameters. Here is an example on how to apply static quantization on a fine-tuned DistilBERT. diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index f0a6d2edab..e93c39882a 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -11,34 +11,82 @@ specific language governing permissions and limitations under the License. Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/models?library=openvino&sort=downloads) and create pipelines to run inference with OpenVINO Runtime without rewriting your APIs. -## Switching from Transformers to Optimum +## Transformers models You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). For that, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. -To load a Transformers model and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. -Here is an example on how to perform inference with OpenVINO Runtime for a text classification class: +As shown in the table below, each task is associated with a class enabling to automatically load your model. 
+
+| Task                                 | Auto Class                           |
+|--------------------------------------|--------------------------------------|
+| `text-classification`                | `OVModelForSequenceClassification`   |
+| `token-classification`               | `OVModelForTokenClassification`      |
+| `question-answering`                 | `OVModelForQuestionAnswering`        |
+| `audio-classification`               | `OVModelForAudioClassification`      |
+| `image-classification`               | `OVModelForImageClassification`      |
+| `feature-extraction`                 | `OVModelForFeatureExtraction`        |
+| `fill-mask`                          | `OVModelForMaskedLM`                 |
+| `text-generation`                    | `OVModelForCausalLM`                 |
+| `text2text-generation`               | `OVModelForSeq2SeqLM`                |
+
+
+### Export
+
+It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI:
+
+```bash
+optimum-cli export openvino --model gpt2 ov_model
+```
+
+The example above illustrates exporting a checkpoint from the 🤗 Hub. When exporting a local model, first make sure that you saved both the model’s weights and tokenizer files in the same directory (`local_path`).
+When using the CLI, pass the `local_path` to the model argument instead of the checkpoint name of the model hosted on the Hub and provide the `--task` argument. You can review the list of supported tasks in the 🤗 [Optimum documentation](https://huggingface.co/docs/optimum/exporters/task_manager). If the task argument is not provided, it will default to the model architecture without any task-specific head.
+Here we set the `task` to `text-generation-with-past`; the `-with-past` suffix enables the re-use of the pre-computed key/values hidden-states (`use_cache=True`).
+
+```bash
+optimum-cli export openvino --model local_path --task text-generation-with-past ov_model
+```
+
+Once the model is exported, you can load the OpenVINO model using:
+
+```python
+from optimum.intel import OVModelForCausalLM
+
+model_id = "helenai/gpt2-ov"
+model = OVModelForCausalLM.from_pretrained(model_id)
+```
+
+You can also load your PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, by setting `export=True` when loading your model.
+
+```python
+from optimum.intel import OVModelForCausalLM
+
+model_id = "gpt2"
+model = OVModelForCausalLM.from_pretrained(model_id, export=True)
+model.save_pretrained("ov_model")
+```
+
+### Inference
+
+You can load an OpenVINO model hosted on the Hub and perform inference; there is no need to adapt your code to get it to work with the `OVModelForXxx` classes:

 ```diff
-- from transformers import AutoModelForSequenceClassification
-+ from optimum.intel import OVModelForSequenceClassification
+- from transformers import AutoModelForCausalLM
++ from optimum.intel import OVModelForCausalLM
  from transformers import AutoTokenizer, pipeline
- model_id = "distilbert-base-uncased-finetuned-sst-2-english"
-- model = AutoModelForSequenceClassification.from_pretrained(model_id)
-+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
+ model_id = "helenai/gpt2-ov"
+- model = AutoModelForCausalLM.from_pretrained(model_id)
++ model = OVModelForCausalLM.from_pretrained(model_id)
  tokenizer = AutoTokenizer.from_pretrained(model_id)
- cls_pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
- outputs = cls_pipe("He's a dreadful magician.")
-
- [{'label': 'NEGATIVE', 'score': 0.9919503927230835}]
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+ results = pipe("He's a dreadful magician and")
 ```

 See the [reference documentation](reference_ov) for more information about parameters, and examples for different tasks.

 To easily save the resulting model, you can use the `save_pretrained()` method, which will save both the BIN and XML files describing the graph. It is useful to save the tokenizer to the same directory, to enable easy loading of the tokenizer for the model.
-
 ```python
 # Save the exported model
 save_directory = "openvino_distilbert"
@@ -46,6 +94,28 @@ model.save_pretrained(save_directory)
 tokenizer.save_pretrained(save_directory)
 ```

+### Weight only quantization
+
+You can also apply INT8 quantization on your model's weights when exporting it with the CLI:
+
+```bash
+optimum-cli export openvino --model gpt2 --int8 ov_model
+```
+
+This will result in the exported model's linear and embedding layers being quantized to INT8, while the activations will be kept in floating point precision.
+
+This can also be done when loading your model by setting the `load_in_8bit` argument when calling the `from_pretrained()` method.
+
+```python
+from optimum.intel import OVModelForCausalLM
+
+model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
+```
+
+To apply quantization on both weights and activations, you can use the `OVQuantizer`; more information can be found in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#optimization).
+
+### Static shape
+
 By default, `OVModelForXxx` support dynamic shapes, enabling inputs of every shapes. To speed up inference, static shapes can be enabled by giving the desired inputs shapes.

 ```python
@@ -55,7 +125,6 @@ model.reshape(1, 9)
 model.compile()
 ```

-
 When fixing the shapes with the `reshape()` method, inference cannot be performed with an input of a different shape. When instantiating your pipeline, you can specify the maximum total input sequence length after tokenization in order for shorter sequences to be padded and for longer sequences to be truncated.
```python @@ -81,16 +150,7 @@ qa_pipe = pipeline( metric = task_evaluator.compute(model_or_pipeline=qa_pipe, data=eval_dataset, metric="squad") ``` - -To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/nightly/openvino_docs_install_guides_configurations_for_intel_gpu.html) about installing drivers for GPU inference). - -```python -# Static shapes speed up inference -model.reshape(1, 9) -model.to("gpu") -# Compile the model before the first inference -model.compile() -``` +### Compilation By default the model will be compiled when instantiating our `OVModel`. In the case where the model is reshaped or placed to another device, the model will need to be recompiled again, which will happen by default before the first inference (thus inflating the latency of the first inference). To avoid an unnecessary compilation, you can disable the first compilation by setting `compile=False`. The model can be compiled before the first inference with `model.compile()`. @@ -106,6 +166,19 @@ model.reshape(1,128) model.compile() ``` +To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/nightly/openvino_docs_install_guides_configurations_for_intel_gpu.html) about installing drivers for GPU inference). + +```python +# Static shapes speed up inference +model.reshape(1, 9) +model.to("gpu") +# Compile the model before the first inference +model.compile() +``` + +### Configuration + + It is possible to pass an `ov_config` parameter to `from_pretrained()` with custom OpenVINO configuration values. This can be used for example to enable full precision inference on devices where FP16 or BF16 inference precision is used by default. @@ -120,7 +193,7 @@ Optimum Intel leverages OpenVINO's model caching to speed up model compiling. By model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"CACHE_DIR":""}) ``` -## Sequence-to-sequence models +### Sequence-to-sequence models Sequence-to-sequence (Seq2Seq) models, that generate a new sequence from an input, can also be used when running inference with OpenVINO. When Seq2Seq models are exported to the OpenVINO IR, they are decomposed into two parts : the encoder and the "decoder" (which actually consists of the decoder with the language modeling head), that are later combined during inference. To speed up sequential decoding, a cache with pre-computed key/values hidden-states will be used by default. An additional model component will be exported: the "decoder" with pre-computed key/values as one of its inputs. This specific export comes from the fact that during the first pass, the decoder has no pre-computed key/values hidden-states, while during the rest of the generation past key/values will be used to speed up sequential decoding. To disable this cache, set `use_cache=False` in the `from_pretrained()` method. @@ -147,23 +220,33 @@ tokenizer.save_pretrained(save_directory) [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` -## Stable Diffusion +## Diffusers models + +Make sure you have 🤗 Diffusers installed. + +To install `diffusers`: +```bash +pip install optimum[diffusers] +``` + + +### Stable Diffusion Stable Diffusion models can also be used when running inference with OpenVINO. 
When Stable Diffusion models -are exported to the OpenVINO format, they are decomposed into three components that are later combined during inference: +are exported to the OpenVINO format, they are decomposed into different components that are later combined during inference: - The text encoder - The U-NET - The VAE encoder - The VAE decoder -Make sure you have 🤗 Diffusers installed. +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionPipeline` | +| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | +| `inpaint` | `OVStableDiffusionInpaintPipeline` | -To install `diffusers`: -```bash -pip install optimum[diffusers] -``` -### Text-to-Image +#### Text-to-Image Here is an example of how you can load an OpenVINO Stable Diffusion model and run inference using OpenVINO Runtime: ```python @@ -208,7 +291,7 @@ In case you want to change any parameters such as the outputs height or width, y -### Text-to-Image with Textual Inversion +#### Text-to-Image with Textual Inversion Here is an example of how you can load an OpenVINO Stable Diffusion model with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: @@ -248,7 +331,7 @@ The left image shows the generation result of original stable diffusion v1.5, th | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_with_textual_inversion.png) | -### Image-to-Image +#### Image-to-Image ```python import requests @@ -269,16 +352,15 @@ image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale= image.save("fantasy_landscape.png") ``` -## Stable Diffusion XL +### Stable Diffusion XL -Before using `OVtableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. 
You can install the libraries as follows: +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionXLPipeline` | +| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | -```bash -pip install diffusers -pip install invisible-watermark>=0.2.0 -``` -### Text-to-Image +#### Text-to-Image Here is an example of how you can load a SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using OpenVINO Runtime: @@ -296,7 +378,7 @@ image.save("train_station.png") |---|---| | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich_2.png) | -### Text-to-Image with Textual Inversion +#### Text-to-Image with Textual Inversion Here is an example of how you can load an SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: @@ -338,7 +420,7 @@ The left image shows the generation result of the original SDXL base 1.0, the ri | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_with_textual_inversion.png) | -### Image-to-Image +#### Image-to-Image Here is an example of how you can load a PyTorch SDXL model, convert it to OpenVINO on-the-fly and run inference using OpenVINO Runtime for *image-to-image*: @@ -358,7 +440,7 @@ pipeline.save_pretrained("openvino-sd-xl-refiner-1.0") ``` -### Refining the image output +#### Refining the image output The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. @@ -374,25 +456,23 @@ image = refiner(prompt=prompt, image=image[None, :]).images[0] ``` - -## Supported tasks - -As shown in the table below, each task is associated with a class enabling to automatically load your model. 
+## Latent Consistency Models | Task | Auto Class | |--------------------------------------|--------------------------------------| -| `text-classification` | `OVModelForSequenceClassification` | -| `token-classification` | `OVModelForTokenClassification` | -| `question-answering` | `OVModelForQuestionAnswering` | -| `audio-classification` | `OVModelForAudioClassification` | -| `image-classification` | `OVModelForImageClassification` | -| `feature-extraction` | `OVModelForFeatureExtraction` | -| `fill-mask` | `OVModelForMaskedLM` | -| `text-generation` | `OVModelForCausalLM` | -| `text2text-generation` | `OVModelForSeq2SeqLM` | -| `text-to-image` | `OVStableDiffusionPipeline` | -| `text-to-image` | `OVStableDiffusionXLPipeline` | -| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | -| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | -| `inpaint` | `OVStableDiffusionInpaintPipeline` | +| `text-to-image` | `OVLatentConsistencyModelPipeline` | + + +### Text-to-Image + +Here is an example of how you can load a Latent Consistency Models (LCMs) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using OpenVINO : + +```python +from optimum.intel import OVLatentConsistencyModelPipeline + +model_id = "SimianLuo/LCM_Dreamshaper_v7" +pipeline = OVLatentConsistencyModelPipeline.from_pretrained(model_id, export=True) +prompt = "sailing ship in storm by Leonardo da Vinci" +images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images +``` diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 866573dca9..3c41760c21 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -62,7 +62,6 @@ tokenizer.save_pretrained(save_dir) The `quantize()` method applies post-training static quantization and export the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device. - ## Training-time optimization Apart from optimizing a model after training like post-training quantization above, `optimum.openvino` also provides optimization methods during training, namely Quantization-Aware Training (QAT) and Joint Pruning, Quantization and Distillation (JPQD). @@ -221,4 +220,4 @@ text = "He's a dreadful magician." outputs = cls_pipe(text) [{'label': 'NEGATIVE', 'score': 0.9840195178985596}] -``` \ No newline at end of file +``` diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 75d8db8f00..95ecea1213 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -13,6 +13,7 @@ # limitations under the License. """Defines the command line for the export with OpenVINO.""" +import logging import sys from pathlib import Path from typing import TYPE_CHECKING, Optional @@ -21,6 +22,9 @@ from ..base import BaseOptimumCLICommand, CommandInfo +logger = logging.getLogger(__name__) + + if TYPE_CHECKING: from argparse import ArgumentParser, Namespace, _SubParsersAction @@ -68,8 +72,26 @@ def parse_args_openvino(parser: "ArgumentParser"): "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it." 
), ) - optional_group.add_argument("--fp16", action="store_true", help="Compress weights to fp16"), - optional_group.add_argument("--int8", action="store_true", help="Compress weights to int8"), + optional_group.add_argument("--fp16", action="store_true", help="Compress weights to fp16") + optional_group.add_argument("--int8", action="store_true", help="Compress weights to int8") + optional_group.add_argument( + "--weight-format", + type=str, + choices=["fp32", "fp16", "int8", "int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"], + default=None, + help=( + "The weight format of the exporting model, e.g. f32 stands for float32 weights, f16 - for float16 weights, i8 - INT8 weights, int4_* - for INT4 compressed weights." + ), + ) + optional_group.add_argument( + "--ratio", + type=float, + default=0.8, + help=( + "Compression ratio between primary and backup precision. In the case of INT4, NNCF evaluates layer sensitivity and keeps the most impactful layers in INT8" + "precision (by default 20% in INT8). This helps to achieve better accuracy after weight quantization." + ), + ) class OVExportCommand(BaseOptimumCLICommand): @@ -95,6 +117,17 @@ def parse_args(parser: "ArgumentParser"): def run(self): from ...exporters.openvino.__main__ import main_export + if self.args.fp16: + logger.warning( + "`--fp16` option is deprecated and will be removed in a future version. Use `--weight-format` instead." + ) + self.args.weight_format = "fp16" + if self.args.int8: + logger.warning( + "`--int8` option is deprecated and will be removed in a future version. Use `--weight-format` instead." + ) + self.args.weight_format = "int8" + # TODO : add input shapes main_export( model_name_or_path=self.args.model, @@ -104,7 +137,7 @@ def run(self): cache_dir=self.args.cache_dir, trust_remote_code=self.args.trust_remote_code, pad_token_id=self.args.pad_token_id, - fp16=self.args.fp16, - int8=self.args.int8, + compression_option=self.args.weight_format, + compression_ratio=self.args.ratio # **input_shapes, ) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 782aa0bc0d..54fe1193e5 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -18,7 +18,7 @@ from typing import Any, Callable, Dict, Optional, Union from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoTokenizer +from transformers import AutoConfig, AutoTokenizer from optimum.exporters import TasksManager from optimum.exporters.onnx import __main__ as optimum_main @@ -26,11 +26,19 @@ from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors -from ...intel.utils.import_utils import is_nncf_available -from ...intel.utils.modeling_utils import patch_decoder_attention_mask +from ...intel.utils.import_utils import is_nncf_available, is_optimum_version, is_transformers_version from .convert import export_models +if is_optimum_version(">=", "1.16.0"): + from optimum.exporters.onnx.constants import SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED +else: + # Copied from https://github.com/huggingface/optimum/blob/main/optimum/exporters/onnx/constants.py + SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED = [ + "bart", + "whisper", + ] + OV_XML_FILE_NAME = "openvino_model.xml" _MAX_UNCOMPRESSED_SIZE = 1e9 @@ -43,7 +51,6 @@ def main_export( output: Union[str, Path], task: str = "auto", device: str = "cpu", - fp16: Optional[bool] = False, framework: Optional[str] = None, 
cache_dir: Optional[str] = None, trust_remote_code: bool = False, @@ -56,7 +63,8 @@ def main_export( model_kwargs: Optional[Dict[str, Any]] = None, custom_onnx_configs: Optional[Dict[str, "OnnxConfig"]] = None, fn_get_submodels: Optional[Callable] = None, - int8: Optional[bool] = None, + compression_option: Optional[str] = None, + compression_ratio: Optional[float] = None, **kwargs_shapes, ): """ @@ -77,8 +85,6 @@ def main_export( use `xxx-with-past` to export the model using past key values in the decoder. device (`str`, defaults to `"cpu"`): The device to use to do the export. Defaults to "cpu". - fp16 (`Optional[bool]`, defaults to `"False"`): - Use half precision during the export. PyTorch-only, requires `device="cuda"`. framework (`Optional[str]`, defaults to `None`): The framework to use for the ONNX export (`"pt"` or `"tf"`). If not provided, will attempt to automatically detect the framework for the checkpoint. @@ -113,6 +119,11 @@ def main_export( fn_get_submodels (`Optional[Callable]`, defaults to `None`): Experimental usage: Override the default submodels that are used at the export. This is especially useful when exporting a custom architecture that needs to split the ONNX (e.g. encoder-decoder). If unspecified with custom models, optimum will try to use the default submodels used for the given task, with no guarantee of success. + compression_option (`Optional[str]`, defaults to `None`): + The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point, + `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point, `f32` - means no compression. + compression_ratio (`Optional[float]`, defaults to `None`): + Compression ratio between primary and backup precision (only relevant to INT4). **kwargs_shapes (`Dict`): Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. 
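As a hedged illustration of the two parameters documented above (not part of this patch), a programmatic INT4 export through `main_export` could look like the sketch below. It assumes `main_export` is importable from `optimum.exporters.openvino`, as in its docstring example, and that `nncf` is installed for the INT4 path.

```python
from optimum.exporters.openvino import main_export  # assumed import path

main_export(
    "gpt2",                              # model id on the Hub or a local path
    output="gpt2_ov_int4/",
    compression_option="int4_sym_g128",  # INT4 symmetric weights, group size 128
    compression_ratio=0.8,               # ~80% of layers in INT4, the rest kept in INT8
)
```

The same export is exposed on the command line through the `--weight-format` and `--ratio` options added to `optimum-cli export openvino` earlier in this patch.
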
@@ -123,9 +134,14 @@ def main_export( >>> main_export("gpt2", output="gpt2_onnx/") ``` """ - if int8 and not is_nncf_available(): + if ( + compression_option is not None + and compression_option != "fp16" + and compression_option != "fp32" + and not is_nncf_available() + ): raise ImportError( - "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`" + f"Compression of the weights to {compression_option} requires nncf, please install it with `pip install nncf`" ) model_kwargs = model_kwargs or {} @@ -137,6 +153,43 @@ def main_export( original_task = task task = TasksManager.map_from_synonym(task) + # Patch the modules to export of GPTQ models w/o GPU + do_gptq_patching = False + try: + config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + model_type = config.model_type.replace("_", "-") + config_dict = config.to_dict() + quantization_config = config_dict.get("quantization_config", None) + do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" + except Exception: + model_type = None + pass + + if do_gptq_patching: + import torch + + torch.set_default_dtype(torch.float32) + orig_cuda_check = torch.cuda.is_available + torch.cuda.is_available = lambda: True + + from optimum.gptq import GPTQQuantizer + + orig_post_init_model = GPTQQuantizer.post_init_model + + def post_init_model(self, model): + from auto_gptq import exllama_set_max_input_length + + class StoreAttr(object): + pass + + model.quantize_config = StoreAttr() + model.quantize_config.desc_act = self.desc_act + if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + model = exllama_set_max_input_length(model, self.max_input_length) + return model + + GPTQQuantizer.post_init_model = post_init_model + framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) # get the shapes to be used to generate dummy inputs @@ -158,6 +211,10 @@ def main_export( f"The task could not be automatically inferred as this is available only for models hosted on the Hugging Face Hub. Please provide the argument --task with the relevant task from {', '.join(TasksManager.get_all_tasks())}. 
Detailed error: {e}" ) + loading_kwargs = {} + if is_transformers_version(">=", "4.36") and model_type in SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED: + loading_kwargs["attn_implementation"] = "eager" + model = TasksManager.get_model_from_task( task, model_name_or_path, @@ -170,6 +227,7 @@ def main_export( trust_remote_code=trust_remote_code, framework=framework, device=device, + **loading_kwargs, ) custom_architecture = False @@ -222,31 +280,24 @@ def main_export( preprocessors = maybe_load_preprocessors( model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code ) - if not task.startswith("text-generation"): - onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( - model=model, - task=task, - monolith=False, - custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - preprocessors=preprocessors, - _variant="default", - ) - else: - # TODO : ModelPatcher will be added in next optimum release - model = patch_decoder_attention_mask(model) - onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) - onnx_config = onnx_config_constructor(model.config) - models_and_onnx_configs = {"model": (model, onnx_config)} + onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( + model=model, + task=task, + monolith=False, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + _variant="default", + legacy=False, + ) - if int8 is None: - int8 = False + if compression_option is None: num_parameters = model.num_parameters() if not is_stable_diffusion else model.unet.num_parameters() if num_parameters >= _MAX_UNCOMPRESSED_SIZE: if is_nncf_available(): - int8 = True + compression_option = "int8" logger.info("The model weights will be quantized to int8.") else: logger.warning( @@ -320,7 +371,12 @@ def main_export( output_names=files_subpaths, input_shapes=input_shapes, device=device, - fp16=fp16, - int8=int8, + compression_option=compression_option, + compression_ratio=compression_ratio, model_kwargs=model_kwargs, ) + + # Unpatch modules after GPTQ export + if do_gptq_patching: + torch.cuda.is_available = orig_cuda_check + GPTQQuantizer.post_init_model = orig_post_init_model diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py index 14636f1f77..56c5a10e5d 100644 --- a/optimum/exporters/openvino/convert.py +++ b/optimum/exporters/openvino/convert.py @@ -28,9 +28,10 @@ from optimum.exporters.onnx.convert import check_dummy_inputs_are_allowed from optimum.exporters.onnx.convert import export_pytorch as export_pytorch_to_onnx from optimum.exporters.onnx.convert import export_tensorflow as export_tensorflow_onnx +from optimum.exporters.onnx.model_patcher import DecoderModelPatcher from optimum.utils import is_diffusers_available -from ...intel.utils.import_utils import is_nncf_available +from ...intel.utils.import_utils import is_nncf_available, is_optimum_version from .utils import ( OV_XML_FILE_NAME, clear_class_registry, @@ -53,8 +54,8 @@ from transformers.modeling_tf_utils import TFPreTrainedModel -def _save_model(model, path: str, compress_to_fp16=False, load_in_8bit=False): - if load_in_8bit: +def _save_model(model, path: str, compression_option: Optional[str] = None, compression_ratio: 
Optional[float] = None): + if compression_option is not None and compression_option != "fp16" and compression_option != "fp32": if not is_nncf_available(): raise ImportError( "Quantization of the weights to int8 requires nncf, please install it with `pip install nncf`" @@ -62,7 +63,32 @@ def _save_model(model, path: str, compress_to_fp16=False, load_in_8bit=False): import nncf - model = nncf.compress_weights(model) + COMPRESSION_OPTIONS = { + "int8": {"mode": nncf.CompressWeightsMode.INT8}, + "int4_sym_g128": { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "group_size": 128, + "ratio": compression_ratio, + }, + "int4_asym_g128": { + "mode": nncf.CompressWeightsMode.INT4_ASYM, + "group_size": 128, + "ratio": compression_ratio, + }, + "int4_sym_g64": { + "mode": nncf.CompressWeightsMode.INT4_SYM, + "group_size": 64, + "ratio": compression_ratio, + }, + "int4_asym_g64": { + "mode": nncf.CompressWeightsMode.INT4_ASYM, + "group_size": 64, + "ratio": compression_ratio, + }, + } + model = nncf.compress_weights(model, **COMPRESSION_OPTIONS[compression_option]) + + compress_to_fp16 = compression_option == "fp16" save_model(model, path, compress_to_fp16) @@ -74,8 +100,8 @@ def export( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, - fp16: bool = False, - int8: bool = False, + compression_option: Optional[str] = None, + compression_ratio: Optional[float] = None, ) -> Tuple[List[str], List[str]]: """ Exports a Pytorch or TensorFlow model to an OpenVINO Intermediate Representation. @@ -92,6 +118,11 @@ def export( device (`str`, *optional*, defaults to `cpu`): The device on which the model will be exported. Either `cpu` or `cuda`. Only PyTorch is supported for export on CUDA devices. + compression_option (`Optional[str]`, defaults to `None`): + The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point, + `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point. + compression_ratio (`Optional[float]`, defaults to `None`): + Compression ratio between primary and backup precision (only relevant to INT4). input_shapes (`Optional[Dict]`, defaults to `None`): If specified, allows to use specific shapes for the example input provided to the exporter. @@ -116,9 +147,9 @@ def export( output, device=device, input_shapes=input_shapes, + compression_option=compression_option, + compression_ratio=compression_ratio, model_kwargs=model_kwargs, - fp16=fp16, - int8=int8, ) elif is_tf_available() and issubclass(type(model), TFPreTrainedModel): @@ -142,6 +173,8 @@ def export_tensorflow( config: OnnxConfig, opset: int, output: Path, + compression_option: Optional[str] = None, + compression_ratio: Optional[float] = None, ): """ Export the TensorFlow model to OpenVINO format. 
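For readers less familiar with NNCF, the sketch below (not part of this patch) shows the direct `nncf.compress_weights` call that the `COMPRESSION_OPTIONS` table above resolves `compression_option="int4_sym_g128"` to. The toy model conversion is only there to make the snippet self-contained and is an assumption, not code from the repository.

```python
import nncf
import openvino as ov
import torch

# Toy stand-in for the IR that export_pytorch obtains from convert_model();
# the real _save_model receives that converted model directly.
toy_module = torch.nn.Sequential(torch.nn.Linear(128, 128), torch.nn.ReLU())
ov_model = ov.convert_model(toy_module, example_input=torch.randn(1, 128))

# Equivalent of COMPRESSION_OPTIONS["int4_sym_g128"] with compression_ratio=0.8
compressed_model = nncf.compress_weights(
    ov_model,
    mode=nncf.CompressWeightsMode.INT4_SYM,  # symmetric 4-bit weights
    group_size=128,                          # group-wise quantization, 128 values per group
    ratio=0.8,                               # share of weights in INT4; the rest stay in INT8
)
# _save_model then writes the result with save_model(), enabling compress_to_fp16
# only when compression_option == "fp16".
```
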
@@ -160,7 +193,9 @@ def export_tensorflow( onnx_path = Path(output).with_suffix(".onnx") input_names, output_names = export_tensorflow_onnx(model, config, opset, onnx_path) ov_model = convert_model(str(onnx_path)) - _save_model(ov_model, output.parent / output, compress_to_fp16=False, load_in_8bit=False) + _save_model( + ov_model, output.parent / output, compression_option=compression_option, compression_ratio=compression_ratio + ) return input_names, output_names, True @@ -172,8 +207,8 @@ def export_pytorch_via_onnx( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, - fp16: bool = False, - int8: bool = False, + compression_option: Optional[str] = None, + compression_ratio: Optional[float] = None, ): """ Exports a PyTorch model to an OpenVINO Intermediate Representation via ONNX export. @@ -193,7 +228,12 @@ def export_pytorch_via_onnx( input_shapes (`optional[Dict]`, defaults to `None`): If specified, allows to use specific shapes for the example input provided to the exporter. model_kwargs (optional[Dict[str, Any]], defaults to `None`): - Additional kwargs for model export + Additional kwargs for model export. + compression_option (`Optional[str]`, defaults to `None`): + The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point, + `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point. + compression_ratio (`Optional[float]`, defaults to `None`): + Compression ratio between primary and backup precision (only relevant to INT4). Returns: `Tuple[List[str], List[str], bool]`: A tuple with an ordered list of the model's inputs, and the named inputs from @@ -215,8 +255,8 @@ def export_pytorch_via_onnx( _save_model( ov_model, output.parent / OV_XML_FILE_NAME if output.suffix != ".xml" else output, - compress_to_fp16=fp16, - load_in_8bit=int8, + compression_option=compression_option, + compression_ratio=compression_ratio, ) return input_names, output_names, True @@ -229,8 +269,8 @@ def export_pytorch( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, - fp16: bool = False, - int8: bool = False, + compression_option: Optional[str] = None, + compression_ratio: Optional[float] = None, ) -> Tuple[List[str], List[str]]: """ Exports a PyTorch model to an OpenVINO Intermediate Representation. @@ -297,6 +337,8 @@ def export_pytorch( dummy_inputs, dict_inputs = remove_none_from_dummy_inputs(dummy_inputs) input_info = get_input_shapes(dummy_inputs, inputs) custom_patcher = type(config).patch_model_for_export != OnnxConfig.patch_model_for_export + patch_model_forward = False + orig_forward = model.forward try: # TorchScript used behind OpenVINO conversion. Optimum supports only return_dict=True models for patching, # while TorchScript do not support dictionary with values of mixed types (e.g. 
Tensor and None) in model input/output @@ -304,7 +346,14 @@ def export_pytorch( # model.config.torchscript = True can not be used for patching, because it overrides return_dict to Flase if custom_patcher or dict_inputs: patcher = config.patch_model_for_export(model, model_kwargs=model_kwargs) - patched_forward = patcher.patched_forward + # DecoderModelPatcher does not override model forward in optimum < 1.15 + if ( + isinstance(patcher, DecoderModelPatcher) and is_optimum_version("<", "1.15.0") + ) or patcher.orig_forward_name != "forward": + patch_model_forward = True + patched_forward = model.forward + else: + patched_forward = patcher.patched_forward @functools.wraps(patched_forward) def ts_patched_forward(*args, **kwargs): @@ -317,16 +366,30 @@ def ts_patched_forward(*args, **kwargs): outputs = patched_forward(*args, **kwargs) return tuple(outputs.values()) - patcher.patched_forward = ts_patched_forward + if not patch_model_forward: + patcher.patched_forward = ts_patched_forward + else: + model.forward = ts_patched_forward with patcher: ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) else: model.config.torchscript = True + model.config.retun_dict = False ov_model = convert_model(model, example_input=dummy_inputs, input=input_info) except Exception as ex: logger.warning(f"Export model to OpenVINO directly failed with: \n{ex}.\nModel will be exported to ONNX") + if patch_model_forward: + model.forward = orig_forward return export_pytorch_via_onnx( - model, config, opset, output, device, input_shapes, model_kwargs, fp16=fp16, int8=int8 + model, + config, + opset, + output, + device, + input_shapes, + model_kwargs, + compression_option=compression_option, + compression_ratio=compression_ratio, ) ordered_dummy_inputs = {param: dummy_inputs[param] for param in sig.parameters if param in dummy_inputs} ordered_input_names = list(inputs) @@ -348,7 +411,7 @@ def ts_patched_forward(*args, **kwargs): inp_tensor.get_node().set_partial_shape(static_shape) inp_tensor.get_node().set_element_type(get_element_type(inp_data.cpu().numpy().dtype)) ov_model.validate_nodes_and_infer_types() - _save_model(ov_model, output, compress_to_fp16=fp16, load_in_8bit=int8) + _save_model(ov_model, output, compression_option=compression_option, compression_ratio=compression_ratio) clear_class_registry() del model gc.collect() @@ -365,8 +428,8 @@ def export_models( device: str = "cpu", input_shapes: Optional[Dict] = None, model_kwargs: Optional[Dict[str, Any]] = None, - fp16: bool = False, - int8: bool = False, + compression_option: Optional[str] = None, + compression_ratio: Optional[int] = None, ) -> Tuple[List[List[str]], List[List[str]]]: """ Export the models to OpenVINO IR format @@ -381,8 +444,13 @@ def export_models( export on CUDA devices. input_shapes (Optional[Dict], optional, Defaults to None): If specified, allows to use specific shapes for the example input provided to the exporter. + compression_option (`Optional[str]`, defaults to `None`): + The weight compression option, e.g. `f16` stands for float16 weights, `i8` - INT8 weights, `int4_sym_g128` - INT4 symmetric weights w/ group size 128, `int4_asym_g128` - as previous but asymmetric w/ zero-point, + `int4_sym_g64` - INT4 symmetric weights w/ group size 64, "int4_asym_g64" - as previous but asymmetric w/ zero-point. + compression_ratio (`Optional[int]`, defaults to `None`): + Compression ratio between primary and backup precision (only relevant to INT4). 
model_kwargs (Optional[Dict[str, Any]], optional): - Additional kwargs for model export + Additional kwargs for model export. Raises: ValueError: if custom names set not equal of number of models @@ -411,8 +479,8 @@ def export_models( device=device, input_shapes=input_shapes, model_kwargs=model_kwargs, - fp16=fp16, - int8=int8, + compression_option=compression_option, + compression_ratio=compression_ratio, ) ) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index dcd2827eab..570a451bd8 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -62,6 +62,7 @@ "OVStableDiffusionInpaintPipeline", "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", + "OVLatentConsistencyModelPipeline", ] else: _import_structure["openvino"].extend( @@ -71,6 +72,7 @@ "OVStableDiffusionInpaintPipeline", "OVStableDiffusionXLPipeline", "OVStableDiffusionXLImg2ImgPipeline", + "OVLatentConsistencyModelPipeline", ] ) @@ -97,6 +99,7 @@ "OVModelForPix2Struct", "OVModelForQuestionAnswering", "OVModelForSeq2SeqLM", + "OVModelForSpeechSeq2Seq", "OVModelForSequenceClassification", "OVModelForTokenClassification", ] @@ -158,6 +161,7 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_openvino_and_diffusers_objects import ( + OVLatentConsistencyModelPipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, @@ -166,6 +170,7 @@ ) else: from .openvino import ( + OVLatentConsistencyModelPipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, @@ -191,6 +196,7 @@ OVModelForQuestionAnswering, OVModelForSeq2SeqLM, OVModelForSequenceClassification, + OVModelForSpeechSeq2Seq, OVModelForTokenClassification, ) diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py index b4c41e0be1..fd946ea607 100644 --- a/optimum/intel/generation/modeling.py +++ b/optimum/intel/generation/modeling.py @@ -26,12 +26,13 @@ from transformers.utils import WEIGHTS_NAME from optimum.exporters import TasksManager +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.modeling_base import OptimizedModel from optimum.utils import NormalizedConfigManager from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_torch_version, is_transformers_version -from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask +from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask if is_transformers_version("<", "4.25.0"): @@ -43,47 +44,45 @@ logger = logging.getLogger(__name__) +def get_float_type(model_dtype: torch.dtype): + if model_dtype == torch.bfloat16: + return "bf16" + elif model_dtype == torch.float16: + return "fp16" + else: + return "fp32" + + def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = False): task = _TASK_ALIASES.get(task, task) signature = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.__call__) onnx_config_class = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) - onnx_config = onnx_config_class(model.config) - if task == "text-generation" and use_cache: - onnx_config = onnx_config_class(model.config, use_past=True, use_past_in_inputs=True) + float_dtype = get_float_type(model.dtype) + if "text-generation" in task: + onnx_config = onnx_config_class( + model.config, use_past=use_cache, use_past_in_inputs=use_cache, 
float_dtype=float_dtype + ) + else: + onnx_config = onnx_config_class(model.config) + dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") - model_inputs = {key: dummy_inputs[key] for key in signature.parameters if dummy_inputs.get(key, None) is not None} - if task == "text-generation" and use_cache and model.config.model_type != "gpt_bigcode": - # WA jit.trace issue of model like llama in https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L464, or else, generation output will be incorrect - pkv = [] - for i in range(len(model_inputs["past_key_values"])): - pkv.append([]) - for j in range(len(model_inputs["past_key_values"][0])): - pkv[i].append(model_inputs["past_key_values"][i][j].to(model.dtype)) - pkv[i] = tuple(pkv[i]) - model_inputs["past_key_values"] = tuple(pkv) - i = model_inputs["input_ids"] - a = model_inputs["attention_mask"] - model_inputs["input_ids"] = torch.cat([torch.zeros(i.shape[0], 1), i], -1).to(i.dtype) - model_inputs["attention_mask"] = torch.cat([torch.zeros(a.shape[0], 1), a], -1).to(a.dtype) - return model_inputs + + return {key: dummy_inputs[key] for key in signature.parameters if dummy_inputs.get(key, None) is not None} def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False): model_inputs = prepare_jit_inputs(model, task, use_cache) # check if the model_inputs is correct. model(**model_inputs) + torch._C._jit_set_texpr_fuser_enabled(False) if "past_key_values" in model_inputs.keys(): model.config.return_dict = False - if is_torch_version(">", "2.0.1"): - traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) - else: - traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + if is_torch_version(">=", "2.1.0"): + traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) else: - if is_torch_version(">=", "2.0.0"): - traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) - else: - traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + traced_model = torch.jit.freeze(traced_model.eval()) traced_model(**model_inputs) traced_model(**model_inputs) @@ -91,11 +90,7 @@ def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False): return traced_model -class PreTrainedModel(OptimizedModel): - pass - - -class BaseModelForCausalLM(PreTrainedModel, GenerationMixin): +class BaseModelForCausalLM(OptimizedModel, GenerationMixin): auto_model_class = AutoModelForCausalLM export_feature = "text-generation" main_input_name = "input_ids" @@ -156,12 +151,23 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) + position_ids = kwargs.get("position_ids", None) + + attention_mask = kwargs.get("attention_mask", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + return { "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": self.use_cache, - "position_ids": None, - "attention_mask": kwargs.get("attention_mask", 
None), + "position_ids": position_ids, + "attention_mask": attention_mask, "token_type_ids": None, } @@ -258,6 +264,7 @@ def forward( input_ids: torch.LongTensor = None, attention_mask: Optional[torch.FloatTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + position_ids: Optional[torch.FloatTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: if attention_mask is None: @@ -268,43 +275,42 @@ def forward( "attention_mask": attention_mask, } + model_type = self.config.model_type.replace("_", "-") + if self.use_cache: if past_key_values is None: nb_pkv = 2 num_layers = self.normalized_config.num_layers - num_attention_heads = self.normalized_config.num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(self.normalized_config, "num_key_value_heads"): - num_key_value_heads = self.normalized_config.num_key_value_heads - hidden_size = self.normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - if self.config.model_type == "gpt_bigcode": - new_shape = [input_ids.shape[0], 0, d_k * 2] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - past_key_values = tuple([empty_tensor] * num_layers) - elif self.config.model_type != "bloom": - new_shape = [input_ids.shape[0], num_key_value_heads, 0, d_k] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - pkv = tuple(empty_tensor for _ in range(nb_pkv)) + d_k = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads + batch_size = input_ids.shape[0] + + if model_type in {"mistral", "llama"}: + num_attention_heads = self.normalized_config.num_key_value_heads + else: + num_attention_heads = self.normalized_config.num_attention_heads + + if model_type == "bloom": + shape_key = (batch_size * num_attention_heads, d_k, 0) + shape_value = (batch_size * num_attention_heads, 0, d_k) + key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device) + value = torch.empty(size=shape_value, dtype=self.model_dtype, device=self._device) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) for _ in range(num_layers) + ) + elif model_type.replace("-", "_") in MULTI_QUERY_ATTN_MODELS: + shape = (batch_size, 0, d_k * 2) + pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device) + past_key_values = tuple(pkv for _ in range(num_layers)) else: - pkv = () - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_ids.shape[0] * num_key_value_heads, d_k, 0] - else: - new_shape = [input_ids.shape[0] * num_key_value_heads, 0, d_k] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - pkv = pkv + (empty_tensor,) - if past_key_values is None: - past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) + shape = (batch_size, num_attention_heads, 0, d_k) + pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device) + past_key_values = tuple(tuple(pkv for _ in range(nb_pkv)) for _ in range(num_layers)) inputs["past_key_values"] = past_key_values + + if position_ids is not None and model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + inputs["position_ids"] = position_ids + outputs = self.model(**inputs) if isinstance(outputs, (list, tuple)): @@ -389,7 +395,7 @@ def _from_transformers( torch_dtype: Optional[Union[str, "torch.dtype"]] = None, **kwargs, ): - if 
is_torch_version("<", "2.0.0"): + if is_torch_version("<", "2.1.0"): raise ImportError("`torch>=2.0.0` is needed to trace your model") task = cls.export_feature @@ -405,12 +411,7 @@ def _from_transformers( } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) - - if model.config.model_type == "bloom": - model.transformer._prepare_attn_mask = _prepare_attn_mask - - if model.config.model_type == "llama": - model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + model = patch_decoder_attention_mask(model) traced_model = jit_trace(model, task, use_cache) save_dir = TemporaryDirectory() diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index e6ae0f2595..5cd224146a 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -164,7 +164,7 @@ def _from_pretrained( if q_config is None: model = model_class.from_pretrained(model_save_dir) else: - init_contexts = [no_init_weights(_enable=True)] + init_contexts = [no_init_weights(_enable=False)] with ContextManagers(init_contexts): model = model_class(config) try: diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 03eccbd1cb..f2d95cbc53 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -15,6 +15,7 @@ import copy import inspect import logging +import warnings from enum import Enum from itertools import chain from pathlib import Path @@ -30,16 +31,25 @@ from neural_compressor.quantization import fit from torch.utils.data import DataLoader, RandomSampler from transformers import ( + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoModelForMultipleChoice, + AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoModelForVision2Seq, DataCollator, PretrainedConfig, PreTrainedModel, + XLNetLMHeadModel, default_data_collator, ) from optimum.exporters import TasksManager from optimum.exporters.onnx import OnnxConfig from optimum.onnxruntime import ORTModel -from optimum.onnxruntime.modeling_decoder import ORTModelDecoder +from optimum.onnxruntime.modeling_decoder import ORTModelForCausalLM from optimum.onnxruntime.modeling_seq2seq import ORTModelForConditionalGeneration from optimum.onnxruntime.utils import ONNX_DECODER_NAME from optimum.quantization_base import OptimumQuantizer @@ -273,7 +283,7 @@ def quantize( if isinstance(self._original_model, ORTModelForConditionalGeneration): raise RuntimeError("ORTModelForConditionalGeneration not supported for quantization") - if isinstance(self._original_model, ORTModelDecoder): + if isinstance(self._original_model, ORTModelForCausalLM): model_or_path = self._original_model.onnx_paths if len(model_or_path) > 1: raise RuntimeError( @@ -299,6 +309,7 @@ def quantize( "The maximum number of trials specified has been reached and no quantized model meeting the specified" " accuracy tolerance has been found. Either the tolerance or the number of trials need to be increased." ) + if isinstance(self._original_model.config, PretrainedConfig): # If backend is IPEX, then the quantized model is JIT model which will drop the config attribute, # so need set config from original_model. 
@@ -545,3 +556,49 @@ def _apply_quantization_from_config(q_config: Dict, model: torch.nn.Module) -> t q_model = convert(q_model, mapping=q_mapping, inplace=True) return q_model + + +class IncQuantizedModel(INCModel): + @classmethod + def from_pretrained(cls, *args, **kwargs): + warnings.warn( + f"The class `{cls.__name__}` has been deprecated and will be removed in optimum-intel v1.12, please use " + f"`{cls.__name__.replace('IncQuantized', 'INC')}` instead." + ) + return super().from_pretrained(*args, **kwargs) + + +class IncQuantizedModelForQuestionAnswering(IncQuantizedModel): + auto_model_class = AutoModelForQuestionAnswering + + +class IncQuantizedModelForSequenceClassification(IncQuantizedModel): + auto_model_class = AutoModelForSequenceClassification + + +class IncQuantizedModelForTokenClassification(IncQuantizedModel): + auto_model_class = AutoModelForTokenClassification + + +class IncQuantizedModelForMultipleChoice(IncQuantizedModel): + auto_model_class = AutoModelForMultipleChoice + + +class IncQuantizedModelForSeq2SeqLM(IncQuantizedModel): + auto_model_class = AutoModelForSeq2SeqLM + + +class IncQuantizedModelForCausalLM(IncQuantizedModel): + auto_model_class = AutoModelForCausalLM + + +class IncQuantizedModelForMaskedLM(IncQuantizedModel): + auto_model_class = AutoModelForMaskedLM + + +class IncQuantizedModelForXLNetLM(IncQuantizedModel): + auto_model_class = XLNetLMHeadModel + + +class IncQuantizedModelForVision2Seq(IncQuantizedModel): + auto_model_class = AutoModelForVision2Seq diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index 8e8fec1758..4360c5abfe 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -15,12 +15,21 @@ import copy import math import os +import shutil import sys import time from collections.abc import Mapping from itertools import chain from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union + +# Integrations must be imported before ML frameworks: +# isort: off +from transformers.integrations import hp_params +from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available + +# isort: on + import datasets import torch import torch.distributed as dist @@ -28,52 +37,68 @@ from neural_compressor.compression import DistillationCallbacks from neural_compressor.conf.pythonic_config import _BaseQuantizationConfig from neural_compressor.experimental.export import torch_to_fp32_onnx, torch_to_int8_onnx - -# from packaging import version +from packaging import version from torch import nn from torch.utils.data import Dataset, RandomSampler -from torch.utils.data.dataloader import DataLoader -from torch.utils.data.distributed import DistributedSampler -from tqdm.auto import tqdm from transformers import Trainer from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init -from transformers.file_utils import WEIGHTS_NAME - -# Integrations must be imported before ML frameworks: -from transformers.integrations import hp_params from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype, unwrap_model from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import 
TRAINER_STATE_NAME from transformers.trainer_callback import TrainerCallback, TrainerState -from transformers.trainer_pt_utils import IterableDatasetShard +from transformers.trainer_pt_utils import get_dataloader_sampler, get_model_param_count from transformers.trainer_utils import ( EvalPrediction, HPSearchBackend, - ShardedDDPOption, TrainOutput, has_length, speed_metrics, ) -from transformers.training_args import TrainingArguments -from transformers.utils import is_apex_available, is_sagemaker_mp_enabled, logging +from transformers.training_args import ParallelMode, TrainingArguments +from transformers.utils import ( + WEIGHTS_NAME, + is_accelerate_available, + is_apex_available, + is_sagemaker_mp_enabled, + is_torch_tpu_available, + logging, +) from optimum.exporters import TasksManager from ..utils.constant import _TASK_ALIASES, MIN_QDQ_ONNX_OPSET, ONNX_WEIGHTS_NAME, TRAINING_ARGS_NAME -from ..utils.import_utils import is_neural_compressor_version +from ..utils.import_utils import is_neural_compressor_version, is_transformers_version from .configuration import INCConfig +if is_accelerate_available(): + from accelerate import __version__ as accelerate_version + from accelerate import skip_first_batches + + if version.parse(accelerate_version) > version.parse("0.20.3"): + pass + DATA_SAMPLERS = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + DATA_SAMPLERS += [SeedableRandomSampler] + + if is_deepspeed_available(): + pass + + if is_apex_available(): from apex import amp if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp +if is_torch_tpu_available(check_device=False): + import torch_xla.core.xla_model as xm + if TYPE_CHECKING: from optimum.exporters.onnx import OnnxConfig @@ -109,6 +134,8 @@ def __init__( task: Optional[str] = None, save_onnx_model: bool = False, ): + self.neftune_noise_alpha = None + super().__init__( model, args, @@ -178,7 +205,12 @@ def __init__( def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): + self.accelerator.free_memory() self._train_batch_size = batch_size + + if self.args.auto_find_batch_size: + self.state.train_batch_size = self._train_batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -186,9 +218,10 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None + num_train_tokens = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps @@ -230,58 +263,110 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = ( - self.sharded_ddp is not None - and self.sharded_ddp != ShardedDDPOption.SIMPLE - or is_sagemaker_mp_enabled() - or self.fsdp is not None + is_fsdp_xla_enabled = ( + self.is_fsdp_xla_enabled if is_transformers_version(">=", "4.36.0") else self.fsdp is not None ) - if args.deepspeed: - 
deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint - ) - self.model = deepspeed_engine.module - self.model_wrapped = deepspeed_engine - self.deepspeed = deepspeed_engine - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - elif not delay_optimizer_creation: + delay_optimizer_creation = is_sagemaker_mp_enabled() or is_fsdp_xla_enabled or self.is_fsdp_enabled + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + self.state.train_batch_size = self._train_batch_size + + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps # Activate gradient checkpointing if needed if args.gradient_checkpointing: - self.model.gradient_checkpointing_enable() + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model = self._wrap_model(self.model_wrapped) - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - self._load_from_checkpoint(resume_from_checkpoint, model) + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if is_transformers_version("<", "4.36.0") and use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. 
+ model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model - if delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) # important: at this point: # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. # Train! logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -306,20 +391,19 @@ def _inner_training_loop( logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: logger.info( - f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " - "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " - "flag to your launch command, but you will resume the training on data already seen by your model." + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." 
) - if self.is_local_process_zero() and not args.disable_tqdm: - steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader - self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.state.trial_name = self.hp_name(self._trial) if trial is not None: assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) @@ -347,26 +431,26 @@ def _inner_training_loop( # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler - ) + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. - # That was before PyTorch 1.11 however... for _ in train_dataloader: break else: # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! - _ = list(train_dataloader.sampler) + sampler = sampler if sampler is not None else [] + _ = list(sampler) + total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): - train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): - train_dataloader.dataset.set_epoch(epoch) - epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) # Reset the past mems state at the beginning of each epoch if necessary. if args.past_index >= 0: @@ -385,8 +469,33 @@ def _inner_training_loop( if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + step = -1 for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + + if is_transformers_version(">=", "4.36.0") and self.args.include_num_input_tokens_seen: + main_input_name = getattr(self.model, "main_input_name", "input_ids") + if main_input_name not in inputs: + logger.warning( + "Tried to track the number of tokens seen, however the current model is " + "not configured properly to know what item is the input. 
To fix this, add " + "a `main_input_name` attribute to the model class you are using." + ) + else: + self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel() + + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -404,18 +513,14 @@ def _inner_training_loop( if self._compression_manager is not None: self._compression_manager.callbacks.on_step_begin(step) - if ( - ((step + 1) % args.gradient_accumulation_steps != 0) - and args.local_rank != -1 - and args._no_sync_in_gradient_accumulation - ): - # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. - with model.no_sync(): - tr_loss_step = self.training_step(model, inputs) - else: + with self.accelerator.accumulate(model): tr_loss_step = self.training_step(model, inputs) - if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)): + if ( + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): # if loss is nan or inf simply add the average of previous logged losses tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: @@ -423,35 +528,36 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps - if self.deepspeed: - self.deepspeed.step() + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) - if (step + 1) % args.gradient_accumulation_steps == 0 or ( + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps - and (step + 1) == steps_in_epoch + is_last_step_and_steps_less_than_grad_acc ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. 
+ if is_last_step_and_steps_less_than_grad_acc: + self.accelerator.gradient_state._set_sync_gradients(True) + # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: + if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping - if self.do_grad_scaling: - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - if is_sagemaker_mp_enabled() and args.fp16: self.optimizer.clip_master_grads(args.max_grad_norm) - elif hasattr(self.optimizer, "clip_grad_norm"): - # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - self.optimizer.clip_grad_norm(args.max_grad_norm) - elif hasattr(model, "clip_grad_norm_"): - # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - model.clip_grad_norm_(args.max_grad_norm) - else: + elif self.use_apex: # Revert to normal clipping otherwise, handling Apex or full precision nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer) if self.use_apex else model.parameters(), + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), args.max_grad_norm, ) @@ -459,27 +565,20 @@ def _inner_training_loop( self._compression_manager.callbacks.on_before_optimizer_step() # Optimizer step - optimizer_was_run = True - if self.deepspeed: - pass # called outside the loop - elif self.do_grad_scaling: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() + self.optimizer.step() if self._compression_manager is not None: self._compression_manager.callbacks.on_after_optimizer_step() - if optimizer_was_run and not self.deepspeed: - self.lr_scheduler.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 - self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) if self._compression_manager is not None: self._compression_manager.callbacks.on_step_end() @@ -501,7 +600,6 @@ def _inner_training_loop( self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) if self._compression_manager is not None: self._compression_manager.callbacks.on_epoch_end() - self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) if self.control.should_training_stop: @@ -513,9 +611,10 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sur the model has been saved by process 0. - - if args.local_rank != -1: + # Wait for everyone to get here so we are sure the model has been saved by process 0. 
+ if is_torch_tpu_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() elif is_sagemaker_mp_enabled(): smp.barrier() @@ -526,7 +625,13 @@ def _inner_training_loop( self._total_loss_scalar += tr_loss.item() train_loss = self._total_loss_scalar / self.state.global_step - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -537,7 +642,26 @@ def _inner_training_loop( self.log(metrics) + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + if self._compression_manager is not None: self._compression_manager.callbacks.on_train_end() diff --git a/optimum/intel/openvino/__init__.py b/optimum/intel/openvino/__init__.py index 6b023accda..6999c6b48f 100644 --- a/optimum/intel/openvino/__init__.py +++ b/optimum/intel/openvino/__init__.py @@ -46,11 +46,12 @@ OVModelForTokenClassification, ) from .modeling_decoder import OVModelForCausalLM -from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM +from .modeling_seq2seq import OVModelForPix2Struct, OVModelForSeq2SeqLM, OVModelForSpeechSeq2Seq if is_diffusers_available(): from .modeling_diffusion import ( + OVLatentConsistencyModelPipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index a45ee281f6..37928289e4 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -107,7 +107,11 @@ def _enable_standard_onnx_export_option(self): # save_onnx_model is defaulted to false so that the final model output is # in OpenVINO IR to realize performance benefit in OpenVINO runtime. # True value of save_onnx_model will save a model in onnx format. 
- if isinstance(self.compression, dict) and self.compression["algorithm"] == "quantization": + if ( + isinstance(self.compression, dict) + and "algorithm" in self.compression + and self.compression["algorithm"] == "quantization" + ): self.compression["export_to_onnx_standard_ops"] = self.save_onnx_model elif isinstance(self.compression, list): for i, algo_config in enumerate(self.compression): diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index a0c753d94d..8cea5eb7b6 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -49,7 +49,7 @@ from optimum.exporters import TasksManager -from ..utils.import_utils import is_timm_available +from ..utils.import_utils import is_timm_available, is_timm_version from .modeling_base import OVBaseModel from .utils import _is_timm_ov_dir @@ -540,6 +540,11 @@ def from_pretrained( "To load a timm model, timm needs to be installed. Please install it with `pip install timm`." ) + if is_timm_version("<", "0.9.0"): + raise ImportError( + "To load a timm model, please make sure to upgrade your `timm` version to at least 0.9.0, you can upgrade it by running `pip install --upgrade timm`" + ) + from .modeling_timm import TimmConfig, TimmForImageClassification, TimmOnnxConfig config = TimmConfig.from_pretrained(model_id, **kwargs) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 58eb2163d0..67e8d20502 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -15,7 +15,7 @@ import logging import os from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union import openvino @@ -43,17 +43,12 @@ logger = logging.getLogger(__name__) -# workaround to enable compatibility between openvino models and transformers pipelines -class PreTrainedModel(OptimizedModel): - pass - - @add_start_docstrings( """ Base OVModel class. """, ) -class OVBaseModel(PreTrainedModel): +class OVBaseModel(OptimizedModel): auto_model_class = None export_feature = None @@ -79,7 +74,19 @@ def __init__( height = -1 if self.export_feature == "image-classification" else None width = -1 if self.export_feature == "image-classification" else None model = self._reshape(model, -1, -1, height, width) - self.input_names = {key.get_any_name(): idx for idx, key in enumerate(model.inputs)} + + input_names = {} + for idx, key in enumerate(model.inputs): + names = tuple(key.get_names()) + input_names[next((name for name in names if "/" not in name), names[0])] = idx + self.input_names = input_names + + output_names = {} + for idx, key in enumerate(model.outputs): + names = tuple(key.get_names()) + output_names[next((name for name in names if "/" not in name), names[0])] = idx + self.output_names = output_names + self.model = model self.request = None if enable_compilation: @@ -153,6 +160,7 @@ def _from_pretrained( force_download: bool = False, cache_dir: Optional[str] = None, file_name: Optional[str] = None, + subfolder: str = "", from_onnx: bool = False, local_files_only: bool = False, load_in_8bit: bool = False, @@ -184,38 +192,59 @@ def _from_pretrained( local_files_only(`bool`, *optional*, defaults to `False`): Whether or not to only look at local files (i.e., do not try to download the model). 
""" + + model_path = Path(model_id) default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME file_name = file_name or default_file_name - # Load the model from local directory - if os.path.isdir(model_id): - file_name = os.path.join(model_id, file_name) - model_save_dir = model_id - # Download the model from the hub - else: - model_file_names = [file_name] - # If not ONNX then OpenVINO IR + model_cache_path = cls._cached_file( + model_path=model_path, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + cache_dir=cache_dir, + file_name=file_name, + subfolder=subfolder, + local_files_only=local_files_only, + ) + model = cls.load_model(model_cache_path, load_in_8bit=load_in_8bit) + return cls(model, config=config, model_save_dir=model_cache_path.parent, **kwargs) - if not from_onnx: - model_file_names.append(file_name.replace(".xml", ".bin")) - file_names = [] + @staticmethod + def _cached_file( + model_path: Union[Path, str], + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + file_name: Optional[str] = None, + subfolder: str = "", + local_files_only: bool = False, + ): + # locates a file in a local folder and repo, downloads and cache it if necessary. + model_path = Path(model_path) + if model_path.is_dir(): + model_cache_path = model_path / file_name + else: + file_name = Path(file_name) + if file_name.suffix != "onnx": + model_file_names = [file_name.with_suffix(".bin"), file_name] + else: + model_file_names = [file_name] for file_name in model_file_names: model_cache_path = hf_hub_download( - repo_id=model_id, - filename=file_name, + repo_id=model_path.as_posix(), + filename=file_name.as_posix(), + subfolder=subfolder, use_auth_token=use_auth_token, revision=revision, cache_dir=cache_dir, force_download=force_download, local_files_only=local_files_only, ) - file_names.append(model_cache_path) - model_save_dir = Path(model_cache_path).parent - file_name = file_names[0] - - model = cls.load_model(file_name, load_in_8bit=load_in_8bit) + model_cache_path = Path(model_cache_path) - return cls(model, config=config, model_save_dir=model_save_dir, **kwargs) + return model_cache_path @classmethod def _from_transformers( @@ -274,7 +303,7 @@ def _from_transformers( @classmethod def _to_load( cls, - model: PreTrainedModel, + model, config: PretrainedConfig, onnx_config: OnnxConfig, use_auth_token: Optional[Union[bool, str]] = None, @@ -311,11 +340,11 @@ def compile(self): if self.request is None: logger.info(f"Compiling the model to {self._device} ...") ov_config = {**self.ov_config} - if "CACHE_DIR" not in self.ov_config.keys(): - # Set default CACHE_DIR only if it is not set. 
+ if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + # Set default CACHE_DIR only if it is not set, and if the model is not in a temporary directory cache_dir = Path(self.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) - logger.info(f"Set CACHE_DIR to {str(cache_dir)}") + logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.request = core.compile_model(self.model, self._device, ov_config) def _reshape( diff --git a/optimum/intel/openvino/modeling_base_seq2seq.py b/optimum/intel/openvino/modeling_base_seq2seq.py index 527adc4347..3471c6f954 100644 --- a/optimum/intel/openvino/modeling_base_seq2seq.py +++ b/optimum/intel/openvino/modeling_base_seq2seq.py @@ -68,8 +68,6 @@ def __init__( self.ov_config = ov_config if ov_config is not None else {} self.preprocessors = kwargs.get("preprocessors", []) - if "GPU" in self._device: - raise ValueError("Support of dynamic shapes for GPU devices is not yet available.") if self.is_dynamic: encoder = self._reshape(encoder, -1, -1, is_decoder=False) decoder = self._reshape(decoder, -1, -1) @@ -264,7 +262,7 @@ def _from_transformers( local_files_only=local_files_only, force_download=force_download, trust_remote_code=trust_remote_code, - int8=load_in_8bit, + compression_option="int8" if load_in_8bit else None, ) config.save_pretrained(save_dir_path) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2645408158..8147cc74e8 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -31,8 +31,9 @@ from ...exporters.openvino import main_export from ..utils.import_utils import is_transformers_version +from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel -from .utils import OV_XML_FILE_NAME, STR_TO_OV_TYPE +from .utils import ONNX_WEIGHTS_NAME, OV_XML_FILE_NAME, STR_TO_OV_TYPE if is_transformers_version("<", "4.25.0"): @@ -78,8 +79,9 @@ "bloom", "codegen", "gpt2", - "gpt_neo", - "gpt_neox", + "gpt-bigcode", + "gpt-neo", + "gpt-neox", "llama", "marian", "opt", @@ -127,7 +129,6 @@ def __init__( self.main_input_name = "input_ids" self.num_pkv = 2 self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) - self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] self._original_model = self.model.clone() # keep original model for serialization @@ -213,7 +214,7 @@ def _from_transformers( load_in_8bit: bool = False, **kwargs, ): - if config.model_type not in _SUPPORTED_ARCHITECTURES: + if config.model_type.replace("_", "-") not in _SUPPORTED_ARCHITECTURES: logger.warning( f"This architecture : {config.model_type} was not validated, only :{', '.join(_SUPPORTED_ARCHITECTURES)} architectures were " "validated, use at your own risk." 
@@ -227,34 +228,6 @@ def _from_transformers( if use_cache: task = task + "-with-past" - # Patch the modules to export of GPTQ models w/o GPU - do_gptq_patching = False - config_dict = config.to_dict() - quantization_config = config_dict.get("quantization_config", None) - do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" - if do_gptq_patching: - torch.set_default_dtype(torch.float32) - orig_cuda_check = torch.cuda.is_available - torch.cuda.is_available = lambda: True - - from optimum.gptq import GPTQQuantizer - - orig_post_init_model = GPTQQuantizer.post_init_model - - def post_init_model(self, model): - from auto_gptq import exllama_set_max_input_length - - class StoreAttr(object): - pass - - model.quantize_config = StoreAttr() - model.quantize_config.desc_act = self.desc_act - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: - model = exllama_set_max_input_length(model, self.max_input_length) - return model - - GPTQQuantizer.post_init_model = post_init_model - main_export( model_name_or_path=model_id, output=save_dir_path, @@ -266,14 +239,9 @@ class StoreAttr(object): local_files_only=local_files_only, force_download=force_download, trust_remote_code=trust_remote_code, - int8=load_in_8bit, + compression_option="int8" if load_in_8bit else None, ) - # Unpatch modules after GPTQ export - if do_gptq_patching: - torch.cuda.is_available = orig_cuda_check - GPTQQuantizer.post_init_model = orig_post_init_model - config.is_decoder = True config.is_encoder_decoder = False config.save_pretrained(save_dir_path) @@ -344,9 +312,11 @@ def forward( input_ids: torch.LongTensor, attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: self.compile() + inputs = {} if self.use_cache and past_key_values is not None: input_ids = input_ids[:, -1:] @@ -354,63 +324,79 @@ def forward( inputs = {} past_len = 0 if past_key_values is not None: - seq_len_dim = 1 if self.model.input(self.key_value_input_names[0]).get_partial_shape()[1].is_dynamic else 2 - past_len = past_key_values[0][0].shape[seq_len_dim] - if self._pkv_precision == Type.bf16: - # numpy does not support bf16, pretending f16, should change to bf16 - past_key_values = tuple( - Tensor(past_key_value, past_key_value.shape, Type.bf16) - for pkv_per_layer in past_key_values - for past_key_value in pkv_per_layer - ) + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + past_len = past_key_values[0][1].shape[-2] + if self._pkv_precision == Type.bf16: + # numpy does not support bf16, pretending f16, should change to bf16 + past_key_values = tuple( + Tensor(past_key_value, past_key_value.shape, Type.bf16) + for pkv_per_layer in past_key_values + for past_key_value in pkv_per_layer + ) + else: + # Flatten the past_key_values + past_key_values = tuple( + past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer + ) else: - # Flatten the past_key_values - past_key_values = tuple( - past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer - ) + past_len = past_key_values[0].shape[-2] + # Add the past_key_values to the decoder inputs inputs = dict(zip(self.key_value_input_names, past_key_values)) # Create empty past_key_values for decoder_with_past first generation step elif self.use_cache: - shape_input_ids = input_ids.shape - num_attention_heads = ( - 
self.normalized_config.num_attention_heads if self.config.model_type == "bloom" else 1 - ) + batch_size = input_ids.shape[0] + if self.config.model_type == "bloom": + batch_size *= self.normalized_config.num_attention_heads + for input_name in self.key_value_input_names: model_inputs = self.model.input(input_name) shape = model_inputs.get_partial_shape() - shape[0] = shape_input_ids[0] * num_attention_heads + shape[0] = batch_size if shape[2].is_dynamic: shape[2] = 0 - if shape[1].is_dynamic: + else: shape[1] = 0 inputs[input_name] = Tensor(model_inputs.get_element_type(), shape.get_shape()) inputs["input_ids"] = np.array(input_ids) - # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names: + if "attention_mask" in self.input_names or "position_ids" in self.input_names: if attention_mask is not None: - inputs["attention_mask"] = np.array(attention_mask) + attention_mask = np.array(attention_mask) else: - inputs["attention_mask"] = np.ones( + attention_mask = np.ones( (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype ) + if "attention_mask" in self.input_names: + inputs["attention_mask"] = attention_mask + + if "position_ids" in self.input_names: + if position_ids is not None: + position_ids = np.array(position_ids) + else: + position_ids = np.cumsum(attention_mask, axis=1) - 1 + position_ids[attention_mask == 0] = 1 + if past_key_values: + position_ids = np.expand_dims(position_ids[:, -1], axis=-1) + + inputs["position_ids"] = position_ids + # Run inference self.request.start_async(inputs, shared_memory=True) self.request.wait() - logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) if self.use_cache: # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 corresponds to the self-attention layer) past_key_values = tuple(self.request.get_tensor(key).data for key in self.key_value_output_names) - # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) - past_key_values = tuple( - past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) - ) + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + # Tuple of tuple of length `n_layers`, with each tuple of length equal to 2 (k/v of self-attention) + past_key_values = tuple( + past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) + ) else: past_key_values = None @@ -418,41 +404,114 @@ def forward( # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): - past_key_values = past_key_values or kwargs.get("past", None) - - # `past_key_values` may be in the stardard format (e.g. 
in contrastive search), converts to bloom's format if needed - if past_key_values is not None and self.config.model_type == "bloom": - if past_key_values[0][0].shape[0] == input_ids.shape[0]: - past_key_values = self._convert_to_bloom_cache(past_key_values) + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + attention_mask = kwargs.get("attention_mask", None) + use_cache = kwargs.get("use_cache", None) + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) return { "input_ids": input_ids, "past_key_values": past_key_values, - "use_cache": self.use_cache, - "position_ids": None, - "attention_mask": kwargs.get("attention_mask", None), - "token_type_ids": None, + "use_cache": use_cache, + "position_ids": position_ids, + "attention_mask": attention_mask, } + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache + @staticmethod def _reorder_cache( - self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: """ This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct beam_idx at every generation step. """ - - if self.config.model_type == "bloom": - return self._reorder_cache_bloom(past_key_values, beam_idx) - - # from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel._reorder_cache return tuple( tuple(np.take(past_state, beam_idx, 0) for past_state in layer_past) for layer_past in past_key_values ) - # Copied from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache - def _reorder_cache_bloom( + def can_generate(self): + """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" + return True + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: PretrainedConfig, + use_auth_token: Optional[Union[bool, str, None]] = None, + revision: Optional[Union[str, None]] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + file_name: Optional[str] = None, + subfolder: str = "", + from_onnx: bool = False, + local_files_only: bool = False, + load_in_8bit: bool = False, + **kwargs, + ): + model_path = Path(model_id) + default_file_name = ONNX_WEIGHTS_NAME if from_onnx else OV_XML_FILE_NAME + file_name = file_name or default_file_name + + model_cache_path = cls._cached_file( + model_path=model_path, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + cache_dir=cache_dir, + file_name=file_name, + subfolder=subfolder, + local_files_only=local_files_only, + ) + + model = cls.load_model(model_cache_path, load_in_8bit=load_in_8bit) + + model_type = config.model_type.replace("_", "-") + if model_type == "bloom": + init_cls = OVBloomForCausalLM + elif model_type == "mpt": + init_cls = OVMPTForCausalLM + elif model_type == "opt": + init_cls = OVOPTForCausalLM + elif model_type == "gpt-bigcode": + init_cls = OVGPTBigCodeForCausalLM + else: + init_cls = cls + + return init_cls(model=model, config=config, 
model_save_dir=model_cache_path.parent, **kwargs) + + +class OVBloomForCausalLM(OVModelForCausalLM): + # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + attention_mask = kwargs.get("attention_mask", None) + use_cache = kwargs.get("use_cache", None) + + # only last token for input_ids if past is not None + if past_key_values: + # the cache may be in the standard format (e.g. in contrastive search), convert to bloom's format if needed + if past_key_values[0][0].shape[0] == input_ids.shape[0]: + past_key_values = self._convert_to_bloom_cache(past_key_values) + + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "position_ids": None, + "attention_mask": attention_mask, + } + + # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM._reorder_cache + def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: """ @@ -461,7 +520,6 @@ def _reorder_cache_bloom( This is required to match `past_key_values` with the correct beam_idx at every generation step. """ standardized_past = self._convert_to_standard_cache(past_key_values, batch_size=len(beam_idx)) - reordered_past = tuple( ( np.take(layer_past[0], beam_idx, 0), @@ -496,9 +554,6 @@ def _convert_to_standard_cache( """ Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, num_heads, ...])) """ - if self.config.model_type != "bloom": - return past_key_value - batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape num_heads = batch_size_times_num_heads // batch_size # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] @@ -511,6 +566,41 @@ def _convert_to_standard_cache( for layer_past in past_key_value ) - def can_generate(self): - """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" - return True + +class OVOPTForCausalLM(OVModelForCausalLM): + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + attention_mask = kwargs.get("attention_mask", None) + use_cache = kwargs.get("use_cache", None) + + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "position_ids": None, + "attention_mask": attention_mask, + } + + +class OVMPTForCausalLM(OVModelForCausalLM): + # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + attention_mask = kwargs.get("attention_mask", None) + use_cache = kwargs.get("use_cache", None) + + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "position_ids": None, + "attention_mask": attention_mask, + } + + +class OVGPTBigCodeForCausalLM(OVModelForCausalLM): + # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM._reorder_cache + @staticmethod + def _reorder_cache( + past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + return tuple(np.take(layer_past, beam_idx, 0) for layer_past in past_key_values) diff --git 
a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1ca0b93643..4443381cd6 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -17,7 +17,7 @@ import os import shutil from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Any, Dict, List, Optional, Union import numpy as np @@ -37,6 +37,7 @@ from openvino.runtime import Core from transformers import CLIPFeatureExtractor, CLIPTokenizer +from optimum.pipelines.diffusers.pipeline_latent_consistency import LatentConsistencyPipelineMixin from optimum.pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin from optimum.pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin from optimum.pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin @@ -69,16 +70,16 @@ class OVStableDiffusionPipelineBase(OVBaseModel, OVTextualInversionLoaderMixin): def __init__( self, - vae_decoder: openvino.runtime.Model, - text_encoder: openvino.runtime.Model, unet: openvino.runtime.Model, config: Dict[str, Any], - tokenizer: "CLIPTokenizer", scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"], - feature_extractor: Optional["CLIPFeatureExtractor"] = None, + vae_decoder: Optional[openvino.runtime.Model] = None, vae_encoder: Optional[openvino.runtime.Model] = None, + text_encoder: Optional[openvino.runtime.Model] = None, text_encoder_2: Optional[openvino.runtime.Model] = None, + tokenizer: Optional["CLIPTokenizer"] = None, tokenizer_2: Optional["CLIPTokenizer"] = None, + feature_extractor: Optional["CLIPFeatureExtractor"] = None, device: str = "CPU", dynamic_shapes: bool = True, compile: bool = True, @@ -270,20 +271,7 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - return cls( - vae_decoder=components["vae_decoder"], - text_encoder=components["text_encoder"], - unet=unet, - config=config, - tokenizer=kwargs.pop("tokenizer", None), - scheduler=kwargs.pop("scheduler"), - feature_extractor=kwargs.pop("feature_extractor", None), - vae_encoder=components["vae_encoder"], - text_encoder_2=components["text_encoder_2"], - tokenizer_2=kwargs.pop("tokenizer_2", None), - model_save_dir=model_save_dir, - **kwargs, - ) + return cls(unet=unet, config=config, model_save_dir=model_save_dir, **components, **kwargs) @classmethod def _from_transformers( @@ -295,10 +283,11 @@ def _from_transformers( force_download: bool = False, cache_dir: Optional[str] = None, local_files_only: bool = False, - tokenizer: "CLIPTokenizer" = None, + tokenizer: Optional["CLIPTokenizer"] = None, scheduler: Union["DDIMScheduler", "PNDMScheduler", "LMSDiscreteScheduler"] = None, feature_extractor: Optional["CLIPFeatureExtractor"] = None, load_in_8bit: bool = False, + tokenizer_2: Optional["CLIPTokenizer"] = None, **kwargs, ): save_dir = TemporaryDirectory() @@ -329,6 +318,7 @@ def _from_transformers( local_files_only=local_files_only, model_save_dir=save_dir, tokenizer=tokenizer, + tokenizer_2=tokenizer_2, scheduler=scheduler, feature_extractor=feature_extractor, load_in_8bit=load_in_8bit, @@ -377,8 +367,10 @@ def _reshape_unet( if batch_size == -1 or num_images_per_prompt == -1: batch_size = -1 else: + batch_size *= num_images_per_prompt # The factor of 2 comes from the guidance scale > 1 - batch_size = 2 * batch_size * num_images_per_prompt + if 
"timestep_cond" not in {inputs.get_any_name() for inputs in model.inputs}: + batch_size *= 2 height = height // self.vae_scale_factor if height > 0 else height width = width // self.vae_scale_factor if width > 0 else width @@ -402,6 +394,8 @@ def _reshape_unet( shapes[inputs] = [batch_size, self.text_encoder_2.config["projection_dim"]] elif inputs.get_any_name() == "time_ids": shapes[inputs] = [batch_size, inputs.get_partial_shape()[1]] + elif inputs.get_any_name() == "timestep_cond": + shapes[inputs] = [batch_size, self.unet.config["time_cond_proj_dim"]] else: shapes[inputs][0] = batch_size shapes[inputs][1] = tokenizer_max_length @@ -539,10 +533,8 @@ def __init__( self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - - # TODO : disable if self._model_dir tmp directory - if "CACHE_DIR" not in self.ov_config: - self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name) + if "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()): + self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") def _compile(self): if self.request is None: @@ -587,6 +579,7 @@ def __call__( encoder_hidden_states: np.ndarray, text_embeds: Optional[np.ndarray] = None, time_ids: Optional[np.ndarray] = None, + timestep_cond: Optional[np.ndarray] = None, ): self._compile() @@ -600,6 +593,8 @@ def __call__( inputs["text_embeds"] = text_embeds if time_ids is not None: inputs["time_ids"] = time_ids + if timestep_cond is not None: + inputs["timestep_cond"] = timestep_cond outputs = self.request(inputs, shared_memory=True) return list(outputs.values()) @@ -932,6 +927,61 @@ def __call__( ) +class OVLatentConsistencyModelPipeline(OVStableDiffusionPipelineBase, LatentConsistencyPipelineMixin): + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 4, + original_inference_steps: int = None, + guidance_scale: float = 8.5, + num_images_per_prompt: int = 1, + **kwargs, + ): + height = height or self.unet.config["sample_size"] * self.vae_scale_factor + width = width or self.unet.config["sample_size"] * self.vae_scale_factor + _height = self.height + _width = self.width + expected_batch_size = self._batch_size + + if _height != -1 and height != _height: + logger.warning( + f"`height` was set to {height} but the static model will output images of height {_height}." + "To fix the height, please reshape your model accordingly using the `.reshape()` method." + ) + height = _height + + if _width != -1 and width != _width: + logger.warning( + f"`width` was set to {width} but the static model will output images of width {_width}." + "To fix the width, please reshape your model accordingly using the `.reshape()` method." 
+ ) + width = _width + + if expected_batch_size != -1: + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = kwargs.get("prompt_embeds").shape[0] + + _raise_invalid_batch_size(expected_batch_size, batch_size, num_images_per_prompt, guidance_scale=0.0) + + return LatentConsistencyPipelineMixin.__call__( + self, + prompt=prompt, + height=height, + width=width, + num_inference_steps=num_inference_steps, + original_inference_steps=original_inference_steps, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + **kwargs, + ) + + def _raise_invalid_batch_size( expected_batch_size: int, batch_size: int, num_images_per_prompt: int, guidance_scale: float ): diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 87cd18d875..d43dbf3427 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -12,18 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import logging from pathlib import Path -from typing import Dict, Optional, Tuple +from tempfile import gettempdir +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union import numpy as np import openvino import torch import transformers from openvino.runtime import Core -from transformers import AutoConfig, AutoModelForSeq2SeqLM, Pix2StructForConditionalGeneration +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoModelForSpeechSeq2Seq, + Pix2StructForConditionalGeneration, + WhisperForConditionalGeneration, +) from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward +from transformers.generation.logits_process import WhisperTimeStampLogitsProcessor from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput +from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE from ..utils.import_utils import is_transformers_version from .modeling_base_seq2seq import OVBaseModelForSeq2SeqLM @@ -34,6 +44,9 @@ else: from transformers.generation import GenerationMixin +if TYPE_CHECKING: + from transformers import PretrainedConfig + core = Core() logger = logging.getLogger(__name__) @@ -175,6 +188,56 @@ ``` """ +SPEECH_SEQ2SEQ_MODEL_DOCSTRING = r""" + Args: + input_features (`torch.FloatTensor`): + Mel features extracted from the raw speech waveform. + `(batch_size, feature_size, encoder_sequence_length)`. + decoder_input_ids (`torch.LongTensor`): + Indices of decoder input sequence tokens in the vocabulary of shape `(batch_size, decoder_sequence_length)`. + encoder_outputs (`torch.FloatTensor`): + The encoder `last_hidden_state` of shape `(batch_size, encoder_sequence_length, hidden_size)`. + past_key_values (`tuple(tuple(torch.FloatTensor), *optional*, defaults to `None`)` + Contains the precomputed key and value hidden states of the attention blocks used to speed up decoding. + The tuple is of length `config.n_layers` with each tuple having 2 tensors of shape + `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)` and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. 
+""" + +AUTOMATIC_SPEECH_RECOGNITION_EXAMPLE = r""" + Example of text generation: + + ```python + >>> from transformers import {processor_class} + >>> from optimum.intel.openvino import {model_class} + >>> from datasets import load_dataset + + >>> processor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}") + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> inputs = processor.feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") + + >>> gen_tokens = model.generate(inputs=inputs.input_features) + >>> outputs = processor.tokenizer.batch_decode(gen_tokens) + ``` + + Example using `transformers.pipeline`: + + ```python + >>> from transformers import {processor_class}, pipeline + >>> from optimum.intel.openvino import {model_class} + >>> from datasets import load_dataset + + >>> processor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}") + >>> speech_recognition = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor) + + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + >>> pred = speech_recognition(ds[0]["audio"]["array"]) + ``` +""" + @add_start_docstrings( """ @@ -202,31 +265,32 @@ def __init__( self.decoder_with_past = None enable_compilation = kwargs.get("compile", True) encoder_cache_dir = Path(self.model_save_dir).joinpath("encoder_cache") - encoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_encoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(encoder_cache_dir)} - ) + ov_encoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_encoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_encoder_config["CACHE_DIR"] = str(encoder_cache_dir) + self.encoder = OVEncoder( self.encoder_model, self._device, ov_encoder_config, main_input_name=self.main_input_name ) + decoder_cache_dir = Path(self.model_save_dir).joinpath("decoder_cache") - decoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_cache_dir)} - ) + ov_decoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_decoder_config["CACHE_DIR"] = str(decoder_cache_dir) + self.decoder = OVDecoder(self.decoder_model, self._device, ov_decoder_config) + if self.use_cache: decoder_past_cache_dir = Path(self.model_save_dir).joinpath("decoder_past_cache") - decoder_past_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_past_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_past_cache_dir)} - ) + ov_decoder_past_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_past_config.keys() and not str(self.model_save_dir).startswith( + gettempdir() + ): + ov_decoder_past_config["CACHE_DIR"] = str(decoder_past_cache_dir) + self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, ov_decoder_past_config) if enable_compilation: self.compile() @@ -260,6 +324,7 @@ def forward( input_ids: torch.LongTensor = None, attention_mask: Optional[torch.FloatTensor] = None, decoder_input_ids: 
Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, **kwargs, @@ -274,6 +339,7 @@ def forward( input_ids=decoder_input_ids, encoder_hidden_states=encoder_outputs.last_hidden_state, encoder_attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, ) else: decoder_outputs = self.decoder_with_past( @@ -281,6 +347,7 @@ def forward( past_key_values=past_key_values, encoder_hidden_states=encoder_outputs.last_hidden_state, encoder_attention_mask=attention_mask, + decoder_attention_mask=decoder_attention_mask, ) return Seq2SeqLMOutput(logits=decoder_outputs.logits, past_key_values=decoder_outputs.past_key_values) @@ -392,9 +459,9 @@ def forward( inputs["attention_mask"] = attention_mask # Run inference - last_hidden_state = torch.from_numpy(self.request(inputs, shared_memory=True)["last_hidden_state"]).to( - self.device - ) + last_hidden_state = torch.from_numpy( + self.request(inputs, share_inputs=True, share_outputs=True)["last_hidden_state"] + ).to(self.device) return BaseModelOutput(last_hidden_state=last_hidden_state) @@ -473,7 +540,7 @@ def forward( if "decoder_attention_mask" in self.input_names and decoder_attention_mask is not None: inputs["decoder_attention_mask"] = decoder_attention_mask # Run inference - self.request.start_async(inputs, shared_memory=True) + self.request.start_async(inputs, share_inputs=True) self.request.wait() logits = torch.from_numpy(self.request.get_tensor("logits").data).to(self.device) @@ -565,35 +632,14 @@ def forward( past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, **kwargs, ) -> Seq2SeqLMOutput: - # Encode if needed : first prediction pass - # Encode if needed (training, first prediction pass) - if encoder_outputs is None: - encoder_outputs = self.encoder( - input_ids=flattened_patches, - attention_mask=attention_mask, - ) - - # Decode - if past_key_values is None or self.use_cache is False: - decoder_outputs = self.decoder( - input_ids=decoder_input_ids, - decoder_attention_mask=decoder_attention_mask, - past_key_values=past_key_values, - encoder_hidden_states=encoder_outputs.last_hidden_state, - encoder_attention_mask=attention_mask, - ) - else: - decoder_outputs = self.decoder_with_past( - input_ids=decoder_input_ids[:, -1:], # Cut decoder_input_ids if past is used - decoder_attention_mask=decoder_attention_mask, - past_key_values=past_key_values, - encoder_hidden_states=encoder_outputs.last_hidden_state, - encoder_attention_mask=attention_mask, - ) - - return Seq2SeqLMOutput( - logits=decoder_outputs.logits, - past_key_values=decoder_outputs.past_key_values, + return super().forward( + input_ids=flattened_patches, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + **kwargs, ) def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_length: int, is_decoder=True): @@ -608,3 +654,513 @@ def _reshape(self, model: openvino.runtime.Model, batch_size: int, sequence_leng shapes[inputs][1] = -1 model.reshape(shapes) return model + + +@add_start_docstrings( + """ + Speech Sequence-to-sequence model with a language modeling head for OpenVINO inference. This class officially supports whisper, speech_to_text. 
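(Editor's note, not part of the patch: the class added just below wires Whisper-style speech models into the OpenVINO runtime. As a minimal sketch of converting a PyTorch checkpoint on the fly — the checkpoint id and save path are illustrative, not taken from this diff:)

```python
from transformers import AutoProcessor
from optimum.intel.openvino import OVModelForSpeechSeq2Seq

# Illustrative checkpoint; the conversion path is the same for other Whisper-style models.
model_id = "openai/whisper-tiny"
processor = AutoProcessor.from_pretrained(model_id)

# `export=True` converts the PyTorch checkpoint to the OpenVINO IR on the fly.
model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)
model.save_pretrained("./whisper_ov")  # reuse the exported IR later without re-exporting
processor.save_pretrained("./whisper_ov")
```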
+ """, + INPUTS_DOCSTRING, +) +class OVModelForSpeechSeq2Seq(OVModelForSeq2SeqLM): + auto_model_class = AutoModelForSpeechSeq2Seq + main_input_name = "input_features" + export_feature = "automatic-speech-recognition" + + def prepare_inputs_for_generation( + self, + input_ids, + attention_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + past_key_values=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ) -> Dict: + if decoder_attention_mask is None: + decoder_attention_mask = torch.ones_like(input_ids).to(input_ids.device) + + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, + } + + @add_start_docstrings_to_model_forward( + SPEECH_SEQ2SEQ_MODEL_DOCSTRING + + AUTOMATIC_SPEECH_RECOGNITION_EXAMPLE.format( + processor_class=_PROCESSOR_FOR_DOC, + model_class="OVModelForSpeechSeq2Seq", + checkpoint="openai/whisper-tiny", + ) + ) + def forward( + self, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.BoolTensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + **kwargs, + ) -> Seq2SeqLMOutput: + return super().forward( + input_ids=input_features, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + **kwargs, + ) + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: "PretrainedConfig", + **kwargs, + ): + if "WhisperForConditionalGeneration" in config.architectures: + return _OVModelForWhisper._from_pretrained(model_id, config, **kwargs) + else: + return super()._from_pretrained(model_id, config, **kwargs) + + +class _OVModelForWhisper(OVModelForSpeechSeq2Seq): + """ + Whisper implements its own generate() method. + """ + + auto_model_class = WhisperForConditionalGeneration + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: "PretrainedConfig", + **kwargs, + ): + return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs) + + # Adapted from transformers.models.whisper.modeling_whisper + def generate( + self, + input_features: Optional[torch.Tensor] = None, + generation_config=None, + logits_processor=None, + stopping_criteria=None, + prefix_allowed_tokens_fn=None, + synced_gpus=False, + return_timestamps=None, + task=None, + language=None, + is_multilingual=None, + prompt_ids: Optional[torch.Tensor] = None, + num_segment_frames: Optional[int] = None, + return_token_timestamps: Optional[bool] = None, + return_segments: bool = False, + attention_mask: Optional[torch.Tensor] = None, + time_precision: int = 0.02, + return_dict_in_generate: Optional[bool] = None, + **kwargs, + ): + if "inputs" in kwargs: + input_features = kwargs.pop("inputs") + logging.warn( + "The input name `inputs` is deprecated. 
Please make sure to use `input_features` instead.", + FutureWarning, + ) + + return_dict_in_generate = ( + return_dict_in_generate + if return_dict_in_generate is not None + else self.generation_config.return_dict_in_generate + ) + + if generation_config is None: + generation_config = copy.deepcopy(self.generation_config) + + input_stride = ( + 1 * 2 + ) # NOTE: replaced from `self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]` + if num_segment_frames is None: + num_segment_frames = input_stride * self.config.max_source_positions + + # 1. Check whether we're in shortform or longform mode + if input_features is not None: + total_input_frames = input_features.shape[-1] + elif "encoder_outputs" in kwargs: + encoder_outputs_shape = ( + kwargs["encoder_outputs"][0].shape + if isinstance(kwargs["encoder_outputs"], BaseModelOutput) + else kwargs["encoder_outputs"].shape + ) + total_input_frames = encoder_outputs_shape[1] * input_stride + else: + raise ValueError("Make sure to provide either `input_features` or `encoder_outputs` to `generate`.") + + is_shortform = total_input_frames <= num_segment_frames + + # 2. Make sure the generation config is correctly set depending on whether timestamps are to be returned or not + if return_timestamps is True: + if not hasattr(generation_config, "no_timestamps_token_id"): + raise ValueError( + "You are trying to return timestamps, but the generation config is not properly set. " + "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " + "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" + ) + generation_config.return_timestamps = return_timestamps + elif not is_shortform: + if return_timestamps is False: + raise ValueError( + "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " + "requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features." + ) + + if not hasattr(generation_config, "no_timestamps_token_id"): + raise ValueError( + "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which " + "requires the generation config to have `no_timestamps_token_id` correctly. " + "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. " + "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363" + "or make sure to pass no more than 3000 mel input features." + ) + + logger.info("Setting `return_timestamps=True` for long-form generation.") + generation_config.return_timestamps = True + else: + generation_config.return_timestamps = False + + # 3. Make sure to correctly set language-related parameters + if is_multilingual is not None: + if not hasattr(generation_config, "is_multilingual"): + raise ValueError( + "The generation config is outdated and is thus not compatible with the `is_multilingual` argument " + "to `generate`. 
Please update the generation config as per the instructions " + "https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" + ) + generation_config.is_multilingual = is_multilingual + + if hasattr(generation_config, "is_multilingual") and not generation_config.is_multilingual: + if task is not None or language is not None: + raise ValueError( + "Cannot specify `task` or `language` for an English-only model. If the model is intended to be " + "multilingual, pass `is_multilingual=True` to generate, or update the generation config." + ) + + if language is not None: + if not hasattr(generation_config, "lang_to_id"): + raise ValueError( + "The generation config is outdated and is thus not compatible with the `language` argument " + "to `generate`. Either set the language using the `forced_decoder_ids` in the model config, " + "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" + ) + language = language.lower() + generation_config.language = language + if task is not None: + if not hasattr(generation_config, "task_to_id"): + raise ValueError( + "The generation config is outdated and is thus not compatible with the `task` argument " + "to `generate`. Either set the task using the `forced_decoder_ids` in the model config, " + "or update the generation config as per the instructions https://github.com/huggingface/transformers/issues/25084#issuecomment-1664398224" + ) + generation_config.task = task + + # 4. Add forced decoder ids depending on passed `language`, `task`,`prompt_ids`, `return_token_timestamps` and `return_timestamps` + forced_decoder_ids = None + # Legacy code for backward compatibility + if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids is not None: + forced_decoder_ids = self.config.forced_decoder_ids + elif ( + hasattr(self.generation_config, "forced_decoder_ids") + and self.generation_config.forced_decoder_ids is not None + ): + forced_decoder_ids = self.generation_config.forced_decoder_ids + else: + forced_decoder_ids = kwargs.get("forced_decoder_ids", None) + + if task is not None or language is not None or (forced_decoder_ids is None and prompt_ids is not None): + forced_decoder_ids = [] + if hasattr(generation_config, "language"): + if generation_config.language in generation_config.lang_to_id.keys(): + language_token = generation_config.language + elif generation_config.language in TO_LANGUAGE_CODE.keys(): + language_token = f"<|{TO_LANGUAGE_CODE[generation_config.language]}|>" + elif generation_config.language in TO_LANGUAGE_CODE.values(): + language_token = f"<|{generation_config.language}|>" + else: + is_language_code = len(generation_config.language) == 2 + raise ValueError( + f"Unsupported language: {generation_config.language}. Language should be one of:" + f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}." + ) + forced_decoder_ids.append((1, generation_config.lang_to_id[language_token])) + else: + forced_decoder_ids.append((1, None)) # automatically detect the language + + if hasattr(generation_config, "task"): + if generation_config.task in TASK_IDS: + forced_decoder_ids.append((2, generation_config.task_to_id[generation_config.task])) + else: + raise ValueError( + f"The `{generation_config.task}`task is not supported. 
The task should be one of `{TASK_IDS}`" + ) + elif hasattr(generation_config, "task_to_id"): + forced_decoder_ids.append((2, generation_config.task_to_id["transcribe"])) # defaults to transcribe + if hasattr(generation_config, "no_timestamps_token_id") and not generation_config.return_timestamps: + idx = forced_decoder_ids[-1][0] + 1 if forced_decoder_ids else 1 + forced_decoder_ids.append((idx, generation_config.no_timestamps_token_id)) + + if forced_decoder_ids is not None: + generation_config.forced_decoder_ids = forced_decoder_ids + + if prompt_ids is not None: + if kwargs.get("decoder_start_token_id") is not None: + raise ValueError( + "When specifying `prompt_ids`, you cannot also specify `decoder_start_token_id` as it gets overwritten." + ) + prompt_ids = prompt_ids.tolist() + decoder_start_token_id, *text_prompt_ids = prompt_ids + # Slicing the text prompt ids in a manner consistent with the OpenAI implementation + # to accomodate context space for the prefix (see https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/decoding.py#L599) + text_prompt_ids = text_prompt_ids[-self.config.max_target_positions // 2 - 1 :] + # Set the decoder_start_token_id to <|startofprev|> + kwargs.update({"decoder_start_token_id": decoder_start_token_id}) + + # If the user passes `max_new_tokens`, increase its number to account for the prompt + if kwargs.get("max_new_tokens", None) is not None: + kwargs["max_new_tokens"] += len(text_prompt_ids) + if kwargs["max_new_tokens"] >= self.config.max_target_positions: + raise ValueError( + f"The length of the sliced `prompt_ids` is {len(text_prompt_ids)}, and the `max_new_tokens` " + f"{kwargs['max_new_tokens'] - len(text_prompt_ids)}. Thus, the combined length of the sliced " + f"`prompt_ids` and `max_new_tokens` is: {kwargs['max_new_tokens']}. This exceeds the " + f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. " + "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, " + f"so that their combined length is less that {self.config.max_target_positions}." + ) + + # Reformat the forced_decoder_ids to incorporate the prompt + non_prompt_forced_decoder_ids = ( + kwargs.pop("forced_decoder_ids", None) or generation_config.forced_decoder_ids + ) + forced_decoder_ids = [ + *text_prompt_ids, + generation_config.decoder_start_token_id, + *[token for _rank, token in non_prompt_forced_decoder_ids], + ] + forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_decoder_ids)] + generation_config.forced_decoder_ids = forced_decoder_ids + + if return_token_timestamps: + kwargs["output_attentions"] = True + return_dict_in_generate = True + + if getattr(generation_config, "task", None) == "translate": + logger.warning("Token-level timestamps may not be reliable for task 'translate'.") + if not hasattr(generation_config, "alignment_heads"): + raise ValueError( + "Model generation config has no `alignment_heads`, token-level timestamps not available. " + "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config." 
+ ) + + if kwargs.get("num_frames") is not None: + generation_config.num_frames = kwargs.pop("num_frames") + + if generation_config.return_timestamps is True: + last_forced_decoder_ids = ( + generation_config.forced_decoder_ids[-1][-1] + if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids + else None + ) + if last_forced_decoder_ids == self.generation_config.no_timestamps_token_id: + # remove no_timestamp to be forcefully generated if we want to return timestamps + # this is also important to make sure `WhisperTimeStampLogitsProcessor` functions correctly + forced_decoder_ids = generation_config.forced_decoder_ids[:-1] + # Make sure that if list is empty we set it to None + generation_config.forced_decoder_ids = None if len(forced_decoder_ids) == 0 else forced_decoder_ids + + timestamp_processor = [WhisperTimeStampLogitsProcessor(generation_config)] + logits_processor = ( + timestamp_processor if logits_processor is None else timestamp_processor + logits_processor + ) + + # 5. If we're in shortform mode, simple generate the whole input at once and return the output + if is_shortform: + outputs = super().generate( + input_features, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + return_dict_in_generate=return_dict_in_generate, + **kwargs, + ) + + if return_token_timestamps and hasattr(generation_config, "alignment_heads"): + num_frames = getattr(generation_config, "num_frames", None) + outputs["token_timestamps"] = self._extract_token_timestamps( + outputs, generation_config.alignment_heads, num_frames=num_frames + ) + + return outputs + + # 6. Else we're in longform mode which is more complex. We need to chunk the audio input depending on when the model generated + # timestamp tokens + # 6.1 Set running parameters for while loop + if not return_segments and return_dict_in_generate: + raise ValueError( + "Make sure to set `return_segments=True` to return generation outputs as part of the `'segments' key.`" + ) + + # if input is longer than 30 seconds we default to long-form generation + timestamp_begin = self.generation_config.no_timestamps_token_id + 1 + # input stride is mel frames per encoder output vector which is the product of all conv strides + batch_size = input_features.shape[0] + + if batch_size > 1 and attention_mask is None: + raise ValueError( + "When doing long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` " + ) + elif batch_size > 1: + max_frames = attention_mask.sum(-1).cpu().to(torch.long) + seek = torch.zeros((batch_size,), dtype=torch.long) + else: + max_frames = torch.ones((1,), dtype=torch.long) * total_input_frames + seek = torch.zeros((1,), dtype=torch.long) + + current_segments = [[] for _ in range(batch_size)] + cur_to_prev_index_map = list(range(batch_size)) + + # batch size can decrease during the run + cur_bsz = prev_bsz = batch_size + + # 6.2 Transcribe audio until we reach the end of all input audios + while (seek < max_frames).any(): + prev_bsz = cur_bsz + + # 6.3 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop + # in case one audio finished earlier than another one. 
Thus, we need to keep a table of "previous-index-2-current-index" in order + # to know which original audio is being decoded + new_cur_to_prev_index_map = [] + for i in range(prev_bsz): + prev_i = cur_to_prev_index_map[i] + if seek[prev_i] >= max_frames[prev_i]: + cut_index = i + (cur_bsz - prev_bsz) + cur_bsz -= 1 + input_features = torch.cat([input_features[:cut_index], input_features[cut_index + 1 :]], dim=0) + else: + # cut out index that goes away + new_cur_to_prev_index_map.append(prev_i) + + # 6.4 Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk + cur_to_prev_index_map = new_cur_to_prev_index_map + time_offset = seek * time_precision / input_stride + seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames) + + # 6.5 Make sure that all inputs are padded to the same input length + segment_input = [] + for i in range(cur_bsz): + prev_i = cur_to_prev_index_map[i] + segment_input_slice = input_features[ + i : i + 1, :, seek[prev_i] : seek[prev_i] + seek_num_frames[prev_i] + ] + + if segment_input_slice.shape[-1] < num_segment_frames: + # pad to 3000 if necessary + segment_input_slice = torch.nn.functional.pad( + segment_input_slice, pad=(0, num_segment_frames - segment_input_slice.shape[-1]) + ) + + segment_input.append(segment_input_slice) + + segment_input = torch.cat(segment_input, dim=0) + + # 6.6 Batch generate current chunk + seek_outputs = super().generate( + segment_input, + generation_config, + logits_processor, + stopping_criteria, + prefix_allowed_tokens_fn, + synced_gpus, + return_dict_in_generate=return_dict_in_generate, + **kwargs, + ) + + if return_token_timestamps and hasattr(generation_config, "alignment_heads"): + num_frames = getattr(generation_config, "num_frames", None) + seek_outputs["token_timestamps"] = self._extract_token_timestamps( + seek_outputs, generation_config.alignment_heads, num_frames=num_frames + ) + + if return_dict_in_generate: + seek_sequences = seek_outputs["sequences"] + seek_outputs = [ + {k: v[i] for k, v in seek_outputs.items()} + for i in range(next(iter(seek_outputs.values())).size(0)) + ] + else: + seek_sequences = seek_outputs + + # 6.7 Loop over each decoded audio individually as each decoding can be of a different length + for i, seek_sequence in enumerate(seek_sequences): + prev_i = cur_to_prev_index_map[i] + + # make sure we cut a predicted EOS token if we are not finished with the generation yet + is_not_final = (seek[prev_i] + num_segment_frames) < max_frames[prev_i] + if is_not_final and seek_sequence[-1] == self.generation_config.eos_token_id: + seek_sequence = seek_sequence[:-1] + + # remove all padding tokens + if seek_sequence[-1] == self.generation_config.pad_token_id: + num_paddings = (seek_sequence == self.generation_config.pad_token_id).sum() + seek_sequence = seek_sequence[:-num_paddings] + + segments, segment_offset = self._retrieve_segment( + seek_sequence=seek_sequence, + seek_outputs=seek_outputs, + time_offset=time_offset, + timestamp_begin=timestamp_begin, + seek_num_frames=seek_num_frames, + cur_bsz=cur_bsz, + time_precision=time_precision, + input_stride=input_stride, + prev_idx=prev_i, + idx=i, + ) + + current_segments[prev_i] += segments + seek[prev_i] += segment_offset + + # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted + # output tokens from the list of dicts. 
If we use batch size > 1, we make sure to pad the output + sequences = [] + max_total_length = 0 + for current_segment_list in current_segments: + sequences.append(torch.cat([d["tokens"] for d in current_segment_list], dim=-1)) + max_total_length = max(max_total_length, len(sequences[-1])) + + for i in range(batch_size): + sequences[i] = torch.nn.functional.pad( + sequences[i], pad=(0, max_total_length - len(sequences[i])), value=self.generation_config.pad_token_id + ) + + sequences = torch.stack(sequences, dim=0) + + # 8. If we return all segments, the predicted output sequences are put under `"sequences"`. + if return_segments: + return {"sequences": sequences, "segments": current_segments} + + return sequences diff --git a/optimum/intel/openvino/modeling_timm.py b/optimum/intel/openvino/modeling_timm.py index 044e8bd3b6..2b20a6a746 100644 --- a/optimum/intel/openvino/modeling_timm.py +++ b/optimum/intel/openvino/modeling_timm.py @@ -163,7 +163,7 @@ def from_pretrained( pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs, ): - timm_config_dict, _ = load_model_config_from_hf(pretrained_model_name_or_path) + timm_config_dict = load_model_config_from_hf(pretrained_model_name_or_path)[0] _, im_h, im_w = timm_config_dict.get("input_size", [3, 224, 224]) diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index bcc7c2908b..acdfb4a324 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -39,7 +39,6 @@ from ...exporters.openvino import export, export_pytorch_via_onnx from ..utils.constant import _TASK_ALIASES -from ..utils.modeling_utils import patch_decoder_attention_mask from .configuration import OVConfig from .modeling_base import OVBaseModel from .modeling_decoder import OVBaseDecoderModel @@ -51,6 +50,14 @@ ) +COMPRESSION_OPTIONS = { + "int8": {"mode": nncf.CompressWeightsMode.INT8}, + "int4_sym_g128": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 128}, + "int4_asym_g128": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 128}, + "int4_sym_g64": {"mode": nncf.CompressWeightsMode.INT4_SYM, "group_size": 64}, + "int4_asym_g64": {"mode": nncf.CompressWeightsMode.INT4_ASYM, "group_size": 64}, +} + register_module(ignored_algorithms=[])(Conv1D) core = Core() @@ -165,7 +172,6 @@ def quantize( if save_directory is None: # TODO : can be set to self.model.config.name_or_path for OVModels when not provided raise ValueError("`save_directory` needs to be specified") - if weights_only: if calibration_dataset is not None: logger.warning( @@ -186,6 +192,7 @@ def quantize( data_collator, remove_unused_columns, weights_only, + quantization_config, **kwargs, ) elif isinstance(self.model, OVBaseModel): @@ -212,6 +219,14 @@ def quantize( else: raise TypeError(f"Unsupported model type: {type(self.model)}") + def _get_compression_options(self, config: OVConfig): + options = {} + if config is not None and "type" in config.compression: + options = COMPRESSION_OPTIONS[config.compression["type"]] + if "ratio" in config.compression: + options["ratio"] = config.compression["ratio"] + return options + def _quantize_ovbasemodel( self, calibration_dataset: Dataset, @@ -256,13 +271,15 @@ def _quantize_ovcausallm( data_collator: Optional[DataCollator] = None, remove_unused_columns: bool = True, weights_only: bool = False, + quantization_config: OVConfig = None, **kwargs, ): save_directory = Path(save_directory) save_directory.mkdir(parents=True, exist_ok=True) if weights_only: - self.model.model = 
nncf.compress_weights(self.model.model) + options = self._get_compression_options(quantization_config) + self.model.model = nncf.compress_weights(self.model.model, **options) self.model.save_pretrained(save_directory) return @@ -394,9 +411,10 @@ def _quantize_torchmodel( task = self.task model = self.model self.model.config.save_pretrained(save_directory) - model = patch_decoder_attention_mask(model) - if task == "text-generation": - onnx_config = onnx_config_class(model.config, use_past=model.config.use_cache) + if task.startswith("text-generation"): + onnx_config = onnx_config_class( + model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache + ) else: onnx_config = onnx_config_class(model.config) diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 0bba054ad3..f5badac7b6 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -16,6 +16,7 @@ import io import math import os +import shutil import sys import time from collections import defaultdict @@ -23,8 +24,15 @@ from pathlib import Path from typing import Callable, Dict, List, Optional, Tuple, Type, Union + +# Integrations must be imported before ML frameworks: +# isort: off +from transformers.integrations import hp_params +from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available + +# isort: on + import openvino -import openvino.runtime import torch import torch.distributed as dist import torch.nn.functional as F @@ -46,40 +54,39 @@ compress_quantize_weights_transformation, ) from openvino.runtime import Core, PartialShape, save_model +from packaging import version +from torch import nn from torch.onnx import export as onnx_export from torch.utils._pytree import tree_map -from torch.utils.data import DataLoader, Dataset, RandomSampler -from torch.utils.data.distributed import DistributedSampler -from tqdm.auto import tqdm +from torch.utils.data import Dataset, RandomSampler from transformers import Trainer from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init -from transformers.integrations import hp_params from transformers.modeling_utils import PreTrainedModel, unwrap_model from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import TRAINER_STATE_NAME, TRAINING_ARGS_NAME from transformers.trainer_callback import TrainerCallback, TrainerState -from transformers.trainer_pt_utils import IterableDatasetShard +from transformers.trainer_pt_utils import get_dataloader_sampler, get_model_param_count from transformers.trainer_utils import ( EvalPrediction, HPSearchBackend, - ShardedDDPOption, TrainOutput, has_length, speed_metrics, ) +from transformers.training_args import ParallelMode from transformers.utils import ( WEIGHTS_NAME, + is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, is_torch_tpu_available, logging, ) +from optimum.exporters import TasksManager from optimum.exporters.onnx import OnnxConfig -from optimum.exporters.tasks import TasksManager from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_transformers_version @@ -95,6 +102,22 @@ ) +if is_accelerate_available(): + from accelerate import __version__ as accelerate_version + from accelerate import skip_first_batches + + if 
version.parse(accelerate_version) > version.parse("0.20.3"): + pass + DATA_SAMPLERS = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + DATA_SAMPLERS += [SeedableRandomSampler] + + if is_deepspeed_available(): + pass + + if is_apex_available(): from apex import amp @@ -171,6 +194,8 @@ def __init__( task: Optional[str] = None, feature: Optional[str] = None, ): + self.neftune_noise_alpha = None + super().__init__( model, args, @@ -244,7 +269,12 @@ def _set_signature_columns_if_needed(self): def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): + self.accelerator.free_memory() self._train_batch_size = batch_size + + if self.args.auto_find_batch_size: + self.state.train_batch_size = self._train_batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -252,9 +282,10 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None + num_train_tokens = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps @@ -268,10 +299,16 @@ def _inner_training_loop( # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's # the best we can do. num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = ( + self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + ) else: max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(args.num_train_epochs) num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size max_steps = args.max_steps # Setting a very large number of epochs so we go as many times as necessary over the iterator. @@ -279,6 +316,8 @@ def _inner_training_loop( num_update_steps_per_epoch = max_steps num_examples = total_train_batch_size * args.max_steps num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps else: raise ValueError( "args.max_steps must be set to a positive value if dataloader does not have a length, was" @@ -287,7 +326,7 @@ def _inner_training_loop( if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: - # torch.nn.DataParallel(model) replicates the model, creating new variables and module + # nn.DataParallel(model) replicates the model, creating new variables and module # references registered here no longer work on other gpus, breaking the module raise ValueError( "Currently --debug underflow_overflow is not supported under DP. 
Please use DDP" @@ -296,30 +335,51 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = ( - self.sharded_ddp is not None - and self.sharded_ddp != ShardedDDPOption.SIMPLE - or is_sagemaker_mp_enabled() - or self.fsdp is not None + is_fsdp_xla_enabled = ( + self.is_fsdp_xla_enabled if is_transformers_version(">=", "4.36.0") else self.fsdp is not None ) - if args.deepspeed: - deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint - ) - self.model = deepspeed_engine.module - self.model_wrapped = deepspeed_engine - self.deepspeed = deepspeed_engine - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - elif not delay_optimizer_creation: + delay_optimizer_creation = is_sagemaker_mp_enabled() or is_fsdp_xla_enabled or self.is_fsdp_enabled + + # We need to reset the scheduler, as its parameters may be different on subsequent calls + if self._created_lr_scheduler: + self.lr_scheduler = None + self._created_lr_scheduler = False + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + self.state.train_batch_size = self._train_batch_size + + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps # Activate gradient checkpointing if needed if args.gradient_checkpointing: - self.model.gradient_checkpointing_enable() + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) if is_transformers_version("<", "4.29.0"): is_distributed = self.args.local_rank != -1 @@ -333,31 +393,67 @@ def _inner_training_loop( model = self._wrap_model(self.model_wrapped) - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - self._load_from_checkpoint(resume_from_checkpoint, model) + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if is_transformers_version("<", "4.36.0") and use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein 
we pass "DummyScheduler" such as when it is specified in DeepSpeed config. + model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model - if delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) # important: at this point: # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. # Train! logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -382,20 +478,19 @@ def _inner_training_loop( logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: logger.info( - f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " - "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " - "flag to your launch command, but you will resume the training on data already seen by your model." + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." 
) - if self.is_local_process_zero() and not args.disable_tqdm: - steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader - self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.state.trial_name = self.hp_name(self._trial) if trial is not None: assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) @@ -408,6 +503,7 @@ def _inner_training_loop( self.state.is_local_process_zero = self.is_local_process_zero() self.state.is_world_process_zero = self.is_world_process_zero() + # tr_loss is a tensor to avoid synchronization of TPUs through .item() tr_loss = torch.tensor(0.0).to(args.device) self.compression_metrics = defaultdict(lambda: torch.tensor(0.0).to(args.device)) # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses @@ -420,31 +516,33 @@ def _inner_training_loop( # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler - ) + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. - # That was before PyTorch 1.11 however... for _ in train_dataloader: break else: - # Otherwise we need to call the whole sampler cause there is some random operation added + # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! - _ = list(train_dataloader.sampler) + sampler = sampler if sampler is not None else [] + _ = list(sampler) + total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): - train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): - train_dataloader.dataset.set_epoch(epoch) + epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) # Reset the past mems state at the beginning of each epoch if necessary. 
if args.past_index >= 0: self._past = None steps_in_epoch = ( - len(train_dataloader) + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps ) @@ -460,8 +558,33 @@ def _inner_training_loop( if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + step = -1 - for step, inputs in enumerate(train_dataloader): + for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + + if is_transformers_version(">=", "4.36.0") and self.args.include_num_input_tokens_seen: + main_input_name = getattr(self.model, "main_input_name", "input_ids") + if main_input_name not in inputs: + logger.warning( + "Tried to track the number of tokens seen, however the current model is " + "not configured properly to know what item is the input. To fix this, add " + "a `main_input_name` attribute to the model class you are using." + ) + else: + self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel() + + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -480,17 +603,14 @@ def _inner_training_loop( # Must be called at the beginning of each training step to prepare the compression method self.compression_controller.scheduler.step() + with self.accelerator.accumulate(model): + tr_loss_step = self.training_step(model, inputs) + if ( - ((step + 1) % args.gradient_accumulation_steps != 0) - and args.local_rank != -1 - and args._no_sync_in_gradient_accumulation + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): - # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. - with model.no_sync(): - tr_loss_step = self.training_step(model, inputs) - else: - tr_loss_step = self.training_step(model, inputs) - if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)): # if loss is nan or inf simply add the average of previous logged losses tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: @@ -498,35 +618,40 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps - if self.deepspeed: - self.deepspeed.step() + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) - if (step + 1) % args.gradient_accumulation_steps == 0 or ( + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps - and (step + 1) == steps_in_epoch + is_last_step_and_steps_less_than_grad_acc ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. 
+ if is_last_step_and_steps_less_than_grad_acc: + self.accelerator.gradient_state._set_sync_gradients(True) + # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: + if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping - if self.do_grad_scaling: + if getattr(self, "do_grad_scaling", False): # AMP: gradients need unscaling self.scaler.unscale_(self.optimizer) if is_sagemaker_mp_enabled() and args.fp16: self.optimizer.clip_master_grads(args.max_grad_norm) - elif hasattr(self.optimizer, "clip_grad_norm"): - # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - self.optimizer.clip_grad_norm(args.max_grad_norm) - elif hasattr(model, "clip_grad_norm_"): - # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - model.clip_grad_norm_(args.max_grad_norm) - else: + elif self.use_apex: # Revert to normal clipping otherwise, handling Apex or full precision - torch.nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer) if self.use_apex else model.parameters(), + nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), args.max_grad_norm, ) @@ -534,7 +659,7 @@ def _inner_training_loop( optimizer_was_run = True if self.deepspeed: pass # called outside the loop - elif self.do_grad_scaling: + elif getattr(self, "do_grad_scaling", False): scale_before = self.scaler.get_scale() self.scaler.step(self.optimizer) self.scaler.update() @@ -548,7 +673,7 @@ def _inner_training_loop( model.zero_grad() self.state.global_step += 1 - self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) @@ -559,7 +684,7 @@ def _inner_training_loop( break if step < 0: logger.warning( - "There seems to be not a single sample in your train_dataloader, stopping training at step" + "There seems to be not a single sample in your epoch_iterator, stopping training at step" f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" f" num_steps ({max_steps}) higher than the number of available samples." ) @@ -577,8 +702,10 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sur the model has been saved by process 0. - if args.local_rank != -1: + # Wait for everyone to get here so we are sure the model has been saved by process 0. 
+ if is_torch_tpu_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() elif is_sagemaker_mp_enabled(): smp.barrier() @@ -589,7 +716,13 @@ def _inner_training_loop( self._total_loss_scalar += tr_loss.item() train_loss = self._total_loss_scalar / self.state.global_step - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -600,8 +733,26 @@ def _inner_training_loop( self.log(metrics) + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + return TrainOutput(self.state.global_step, train_loss, metrics) def compute_distillation_loss(self, inputs, student_outputs): @@ -705,12 +856,12 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): if state_dict is None: state_dict = self.model.state_dict() if is_pretrained_model: - unwrapped_model.save_pretrained(output_dir, state_dict=state_dict) + unwrapped_model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False) else: logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - self.model.save_pretrained(output_dir, state_dict=state_dict) + self.model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) diff --git a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py index 72c2b8de10..78016ea71c 100644 --- a/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py +++ b/optimum/intel/utils/dummy_openvino_and_diffusers_objects.py @@ -68,3 +68,14 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino", "diffusers"]) + + +class OVLatentConsistencyModelPipeline(metaclass=DummyObject): + _backends = ["openvino", "diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino", "diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino", "diffusers"]) diff --git a/optimum/intel/utils/dummy_openvino_objects.py b/optimum/intel/utils/dummy_openvino_objects.py index a6d62652d5..9e17035d70 100644 --- 
a/optimum/intel/utils/dummy_openvino_objects.py +++ b/optimum/intel/utils/dummy_openvino_objects.py @@ -136,6 +136,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["openvino"]) +class OVModelForSpeechSeq2Seq(metaclass=DummyObject): + _backends = ["openvino"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["openvino"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["openvino"]) + + class OVModelForSequenceClassification(metaclass=DummyObject): _backends = ["openvino"] diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 895a55270f..8eeae7cc5a 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -29,6 +29,7 @@ STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt} +_optimum_version = importlib_metadata.version("optimum") _transformers_available = importlib.util.find_spec("transformers") is not None _transformers_version = "N/A" @@ -80,7 +81,10 @@ try: _openvino_version = importlib_metadata.version("openvino") except importlib_metadata.PackageNotFoundError: - _openvino_available = False + try: + _openvino_version = importlib_metadata.version("openvino-nightly") + except importlib_metadata.PackageNotFoundError: + _openvino_available = False _nncf_available = importlib.util.find_spec("nncf") is not None @@ -185,6 +189,10 @@ def is_transformers_version(operation: str, version: str): return compare_versions(parse(_transformers_version), operation, version) +def is_optimum_version(operation: str, version: str): + return compare_versions(parse(_optimum_version), operation, version) + + def is_neural_compressor_version(operation: str, version: str): """ Compare the current Neural Compressor version to a given reference with an operation. @@ -242,6 +250,15 @@ def is_ipex_version(operation: str, version: str): return compare_versions(parse(_ipex_version), operation, version) +def is_timm_version(operation: str, version: str): + """ + Compare the current timm version to a given reference with an operation. + """ + if not _timm_available: + return False + return compare_versions(parse(_timm_version), operation, version) + + DIFFUSERS_IMPORT_ERROR = """ {0} requires the diffusers library but it was not found in your environment. You can install it with pip: `pip install diffusers`. Please note that you may need to restart your runtime after installation. 
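The `import_utils.py` hunk above adds `is_optimum_version` and `is_timm_version` helpers and lets OpenVINO detection fall back to the `openvino-nightly` package. Below is a minimal usage sketch of those helpers; the version thresholds and the guarded behaviour are illustrative assumptions, not taken from this diff.

```python
# Minimal sketch, assuming optimum-intel (with the changes above) is installed.
# The thresholds and the guarded behaviour are illustrative assumptions only.
from optimum.intel.utils.import_utils import is_optimum_version, is_timm_version

if is_optimum_version("<", "1.14.0"):
    # The setup.py change later in this diff raises the minimum optimum version.
    raise ImportError("This sketch assumes optimum>=1.14.0.")

# is_timm_version returns False when timm is not installed, so the gate is safe to call.
if is_timm_version(">=", "0.9.0"):
    print("timm >= 0.9.0 available")
```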
diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index 17abf1059e..1a3b6fbede 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -18,6 +18,9 @@ from transformers.modeling_utils import PreTrainedModel +MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"} + + # Modified from transformers.models.bloom.modeling_bloom._make_causal_mask def _make_causal_mask( input_ids_shape: torch.Size, @@ -92,6 +95,40 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, return combined_attention_mask +# Modified from transformers.models.mistral.modeling_mistral._prepare_decoder_sliding_window_attention_mask +def _prepare_decoder_sliding_window_attention_mask( + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: int, +): + from transformers.models.mistral.modeling_mistral import _expand_mask, _make_sliding_window_causal_mask + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + + combined_attention_mask = _make_sliding_window_causal_mask( + input_shape, + device=inputs_embeds.device, + dtype=inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + sliding_window=sliding_window, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def patch_decoder_attention_mask(model: "PreTrainedModel"): """ Apply patch on decoder with past model forward to resolve first inference based on model architecture @@ -106,6 +143,8 @@ def patch_decoder_attention_mask(model: "PreTrainedModel"): model.transformer._prepare_attn_mask = _prepare_attn_mask elif model.config.model_type == "llama": model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + elif model.config.model_type == "mistral": + model.model._prepare_decoder_attention_mask = _prepare_decoder_sliding_window_attention_mask elif model.config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask return model diff --git a/optimum/intel/version.py b/optimum/intel/version.py index f8f59092d8..d34b917507 100644 --- a/optimum/intel/version.py +++ b/optimum/intel/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.12.0.dev0" +__version__ = "1.13.0.dev0" diff --git a/setup.py b/setup.py index 83185467b1..a838364c94 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) INSTALL_REQUIRE = [ - "optimum>=1.13.0", + "optimum>=1.14.0", "transformers>=4.20.0", "datasets>=1.4.0", "sentencepiece", @@ -43,10 +43,11 @@ "neural-compressor>=2.2.0", "onnx", "onnxruntime<1.15.0", + "transformers>=4.33.0", ], - "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime"], - "nncf": ["nncf>=2.6.0"], - "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch<2.1.0", "onnx", "torch<2.1.0"], + "openvino": ["openvino>=2023.2", "onnx", "onnxruntime", "transformers>=4.33.0"], + "nncf": ["nncf>=2.7.0"], + "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, diff --git a/tests/generation/test_modeling.py b/tests/generation/test_modeling.py index 0fd668ad8f..db36b924f4 100644 --- a/tests/generation/test_modeling.py +++ b/tests/generation/test_modeling.py @@ -20,6 +20,7 @@ from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, pipeline, set_seed +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel.generation.modeling import TSModelForCausalLM @@ -28,6 +29,9 @@ "gptj": "hf-internal-testing/tiny-random-gptj", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "mistral": "echarlaix/tiny-random-mistral", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", } SEED = 42 @@ -48,7 +52,11 @@ class ModelingIntegrationTest(unittest.TestCase): "gpt2", "gptj", "gpt_neo", + "mistral", + "llama", + # "gpt_bigcode", ) + GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -61,7 +69,12 @@ def test_compare_to_transformers(self, model_arch): trfs_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer("This is a sample", return_tensors="pt") - outputs = model(**tokens) + + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + outputs = model(**tokens, position_ids=position_ids) self.assertIsInstance(outputs.logits, torch.Tensor) with torch.no_grad(): trfs_outputs = trfs_model(**tokens) @@ -71,7 +84,8 @@ def test_compare_to_transformers(self, model_arch): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) loaded_model = TSModelForCausalLM.from_pretrained(tmpdirname) - loaded_model_outputs = loaded_model(**tokens) + loaded_model_outputs = loaded_model(**tokens, position_ids=position_ids) + self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -120,7 +134,6 @@ def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["gpt2"] tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer("This is a sample input", return_tensors="pt") - model_with_pkv = TSModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) # Warmup _ = model_with_pkv.generate(**tokens) @@ -136,6 +149,9 @@ def test_compare_with_and_without_past_key_values(self): 
outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) + self.assertTrue(model_with_pkv.use_cache) + self.assertFalse(model_without_pkv.use_cache) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index fc2a310595..8098f011c5 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -19,6 +19,7 @@ import unittest import torch +from packaging.version import Version, parse from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed @@ -39,6 +40,7 @@ INCTrainer, ) from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME +from optimum.version import __version__ as _optimum_version os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -133,6 +135,7 @@ def test_pipeline(self, model_id, task): pipe(*inputs) + @unittest.skipIf(parse(_optimum_version) < Version("1.14.0"), "not supported, needs optimum>=v1.14.0") def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/neural_compressor/test_onnx.py b/tests/neural_compressor/test_onnx.py index f5dc0b7c66..387c369dd1 100644 --- a/tests/neural_compressor/test_onnx.py +++ b/tests/neural_compressor/test_onnx.py @@ -54,7 +54,7 @@ def test_static_quantization(self, task, model_name, expected_quantized_matmuls) tokenizer.pad_token = tokenizer.eos_token quantizer = INCQuantizer.from_pretrained(model, task=task) calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples) - save_onnx_model = True + save_onnx_model = False op_type_dict = ( {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}} if save_onnx_model diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index d2b9960258..b90490d610 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -16,7 +16,12 @@ from tempfile import TemporaryDirectory from parameterized import parameterized -from utils_tests import _ARCHITECTURES_TO_EXPECTED_INT8, MODEL_NAMES, get_num_quantized_nodes +from utils_tests import ( + _ARCHITECTURES_TO_EXPECTED_INT4_INT8, + _ARCHITECTURES_TO_EXPECTED_INT8, + MODEL_NAMES, + get_num_quantized_nodes, +) from optimum.exporters.openvino.__main__ import main_export from optimum.intel import ( # noqa @@ -57,9 +62,26 @@ class OVCLIExportTestCase(unittest.TestCase): ("stable-diffusion-xl", "stable-diffusion-xl-refiner"), ) - def _openvino_export(self, model_name: str, task: str, fp16: bool = False, int8: bool = False): + SUPPORTED_4BIT_ARCHITECTURES = (("text-generation-with-past", "opt125m"),) + + SUPPORTED_4BIT_OPTIONS = ["int4_sym_g128", "int4_asym_g128", "int4_sym_g64", "int4_asym_g64"] + + TEST_4BIT_CONFIGURATONS = [] + for arch in SUPPORTED_4BIT_ARCHITECTURES: + for option in SUPPORTED_4BIT_OPTIONS: + TEST_4BIT_CONFIGURATONS.append([arch[0], arch[1], option]) + + def _openvino_export( + self, model_name: str, task: str, compression_option: str = None, compression_ratio: float = None + ): with TemporaryDirectory() as tmpdir: - main_export(model_name_or_path=model_name, 
output=tmpdir, task=task, fp16=fp16, int8=int8) + main_export( + model_name_or_path=model_name, + output=tmpdir, + task=task, + compression_option=compression_option, + compression_ratio=compression_ratio, + ) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_export(self, task: str, model_type: str): @@ -80,7 +102,7 @@ def test_exporters_cli(self, task: str, model_type: str): def test_exporters_cli_fp16(self, task: str, model_type: str): with TemporaryDirectory() as tmpdir: subprocess.run( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --fp16 {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format fp16 {tmpdir}", shell=True, check=True, ) @@ -91,7 +113,7 @@ def test_exporters_cli_fp16(self, task: str, model_type: str): def test_exporters_cli_int8(self, task: str, model_type: str): with TemporaryDirectory() as tmpdir: subprocess.run( - f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --int8 {tmpdir}", + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format int8 {tmpdir}", shell=True, check=True, ) @@ -110,5 +132,21 @@ def test_exporters_cli_int8(self, task: str, model_type: str): expected_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): - _, num_int8 = get_num_quantized_nodes(model) + _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_int8[i], num_int8) + + @parameterized.expand(TEST_4BIT_CONFIGURATONS) + def test_exporters_cli_int4(self, task: str, model_type: str, option: str): + with TemporaryDirectory() as tmpdir: + subprocess.run( + f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}", + shell=True, + check=True, + ) + model_kwargs = {"use_cache": task.endswith("with-past")} if "generation" in task else {} + model = eval(_HEAD_TO_AUTOMODELS[task.replace("-with-past", "")]).from_pretrained(tmpdir, **model_kwargs) + + expected_int8, expected_int4 = _ARCHITECTURES_TO_EXPECTED_INT4_INT8[model_type] + _, num_int8, num_int4 = get_num_quantized_nodes(model) + self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int4, num_int4) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index bcd1bdb903..dc33b39f2a 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -40,6 +40,7 @@ AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, + AutoModelForSpeechSeq2Seq, AutoModelForTokenClassification, AutoTokenizer, GenerationConfig, @@ -51,6 +52,7 @@ from transformers.onnx.utils import get_preprocessor from utils_tests import MODEL_NAMES +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, @@ -64,6 +66,7 @@ OVModelForQuestionAnswering, OVModelForSeq2SeqLM, OVModelForSequenceClassification, + OVModelForSpeechSeq2Seq, OVModelForTokenClassification, OVStableDiffusionPipeline, ) @@ -111,6 +114,19 @@ def test_load_from_hub_and_save_model(self): self.assertIsInstance(loaded_model.config, PretrainedConfig) loaded_model_outputs = loaded_model(**tokens) + # Test that model caching is automatically enabled + openvino_cache_dir = loaded_model.model_save_dir / "model_cache" + self.assertTrue(openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(openvino_cache_dir.glob("*.blob"))), 1) + + # Test specifying 
ov_config with throughput hint and manual cache dir + manual_openvino_cache_dir = loaded_model.model_save_dir / "manual_model_cache" + ov_config = {"CACHE_DIR": str(manual_openvino_cache_dir), "PERFORMANCE_HINT": "THROUGHPUT"} + loaded_model = OVModelForSequenceClassification.from_pretrained(self.OV_MODEL_ID, ov_config=ov_config) + self.assertTrue(manual_openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(manual_openvino_cache_dir.glob("*.blob"))), 1) + self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") + with tempfile.TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) @@ -120,6 +136,7 @@ def test_load_from_hub_and_save_model(self): outputs = model(**tokens) self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits)) + del loaded_model del model gc.collect() @@ -276,6 +293,10 @@ def test_pipeline(self, model_arch): self.assertTrue(not model.is_dynamic) self.assertGreaterEqual(outputs[0]["score"], 0.0) self.assertIsInstance(outputs[0]["label"], str) + # Test that model caching was not automatically enabled for exported model + openvino_cache_dir = model.model_save_dir / "model_cache" + self.assertFalse(openvino_cache_dir.is_dir()) + del model del pipe gc.collect() @@ -449,6 +470,7 @@ def test_pipeline(self, model_arch): class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES = ( "bart", + "gpt_bigcode", "blenderbot", "blenderbot-small", "bloom", @@ -459,12 +481,12 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "gpt_neox", "llama", "marian", + # "mistral", "mpt", "opt", "pegasus", ) GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -477,7 +499,12 @@ def test_compare_to_transformers(self, model_arch): tokens = tokenizer( "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None ) - ov_outputs = ov_model(**tokens) + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + ov_outputs = ov_model(**tokens, position_ids=position_ids) + self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) with torch.no_grad(): @@ -537,29 +564,17 @@ def test_compare_with_and_without_past_key_values(self): tokens = tokenizer("This is a sample input", return_tensors="pt") model_with_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) - # Warmup - _ = model_with_pkv.generate(**tokens) - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) model_without_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False) - - # Warmup - _ = model_without_pkv.generate(**tokens) - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, 
min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + del model_with_pkv del model_without_pkv gc.collect() @@ -1192,3 +1207,66 @@ def test_compare_with_and_without_past_key_values(self): del model_with_pkv del model_without_pkv gc.collect() + + +class OVModelForSpeechSeq2SeqIntegrationTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ("whisper",) + + def _generate_random_audio_data(self): + np.random.seed(10) + t = np.linspace(0, 5.0, int(5.0 * 22050), endpoint=False) + # generate pure sine wave at 220 Hz + audio_data = 0.5 * np.sin(2 * np.pi * 220 * t) + return audio_data + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch): + model_id = MODEL_NAMES[model_arch] + set_seed(SEED) + ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) + self.assertIsInstance(ov_model.config, PretrainedConfig) + transformers_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id) + processor = get_preprocessor(model_id) + data = self._generate_random_audio_data() + features = processor.feature_extractor(data, return_tensors="pt") + + decoder_start_token_id = transformers_model.config.decoder_start_token_id + decoder_inputs = {"decoder_input_ids": torch.ones((1, 1), dtype=torch.long) * decoder_start_token_id} + + with torch.no_grad(): + transformers_outputs = transformers_model(**features, **decoder_inputs) + + for input_type in ["pt", "np"]: + features = processor.feature_extractor(data, return_tensors=input_type) + + if input_type == "np": + decoder_inputs = {"decoder_input_ids": np.ones((1, 1), dtype=np.int64) * decoder_start_token_id} + + ov_outputs = ov_model(**features, **decoder_inputs) + self.assertIn("logits", ov_outputs) + # Compare tensor outputs + self.assertTrue(torch.allclose(torch.Tensor(ov_outputs.logits), transformers_outputs.logits, atol=1e-3)) + + del transformers_model + del ov_model + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline(self, model_arch): + model_id = MODEL_NAMES[model_arch] + model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True) + processor = get_preprocessor(model_id) + GenerationConfig.from_pretrained(model_id) + pipe = pipeline( + "automatic-speech-recognition", + model=model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + ) + data = self._generate_random_audio_data() + outputs = pipe(data) + self.assertIsInstance(outputs["text"], str) + + del pipe + del model + gc.collect() diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c1ec95ea9b..c3378c08e6 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -63,7 +63,7 @@ class OVQuantizerTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 32, 35), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 22), 
+ (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 23), ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) @@ -93,7 +93,7 @@ def preprocess_function(examples, tokenizer): model = model_cls.from_pretrained(tmp_dir, file_name=file_name) # TODO: uncomment once move to a newer version of NNCF which has some fixes (addmm, baddmm) - # num_fake_quantize, num_int8 = get_num_quantized_nodes(model) + # num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) # self.assertEqual(expected_fake_quantize, num_fake_quantize) # self.assertEqual(expected_int8, num_int8) @@ -110,6 +110,8 @@ def preprocess_function(examples, tokenizer): def test_ovmodel_static_quantization(self, model_cls, model_name, expected_fake_quantize, expected_int8): task = model_cls.export_feature dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task] + if "gpt2" in model_name: + expected_int8 -= 1 def preprocess_function(examples, tokenizer): return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True) @@ -132,7 +134,7 @@ def preprocess_function(examples, tokenizer): model = model_cls.from_pretrained(tmp_dir) - num_fake_quantize, num_int8 = get_num_quantized_nodes(model) + num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) self.assertEqual(expected_int8, num_int8) @@ -143,11 +145,13 @@ def preprocess_function(examples, tokenizer): class OVWeightCompressionTest(unittest.TestCase): # TODO : add models - SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS = ( - (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 70, 35), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 45, 22), + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS = ( + (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 70, 70), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 45, 44), ) + SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS = ((OVModelForCausalLM, "opt125m", 82, 295),) + SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = ( (OVModelForCausalLM, "gpt2"), (OVModelForMaskedLM, "bert"), @@ -162,7 +166,7 @@ class OVWeightCompressionTest(unittest.TestCase): (OVStableDiffusionXLPipeline, "stable-diffusion-xl"), ) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature @@ -176,7 +180,7 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i quantizer.quantize(save_directory=tmp_dir, weights_only=True) model = model_cls.from_pretrained(tmp_dir) - _, num_int8 = get_num_quantized_nodes(model) + _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_pt_int8, num_int8) tokens = tokenizer("This is a sample input", return_tensors="pt") @@ -187,8 +191,8 @@ def test_automodel_weight_compression(self, model_cls, model_name, expected_pt_i loaded_config = OVConfig.from_pretrained(tmp_dir) self.assertIsNotNone(loaded_config) - @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS) - def test_ovmodel_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_8BIT_COMPRESSED_MATMULS) + def 
test_ovmodel_8bit_weight_compression(self, model_cls, model_name, expected_pt_int8, expected_ov_int8): task = model_cls.export_feature with tempfile.TemporaryDirectory() as tmp_dir: @@ -201,13 +205,40 @@ def test_ovmodel_weight_compression(self, model_cls, model_name, expected_pt_int quantizer.quantize(save_directory=tmp_dir, weights_only=True) model = model_cls.from_pretrained(tmp_dir) - _, num_int8 = get_num_quantized_nodes(model) + _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8, num_int8) tokens = tokenizer("This is a sample input", return_tensors="pt") outputs = model(**tokens) self.assertTrue("logits" in outputs) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_4BIT_COMPRESSED_MATMULS) + def test_ovmodel_4bit_weight_compression(self, model_cls, model_name, expected_int8, expected_int4): + task = model_cls.export_feature + + with tempfile.TemporaryDirectory() as tmp_dir: + model_id = MODEL_NAMES[model_name] + transformers_model = model_cls.from_pretrained(model_id, export=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + quantizer = OVQuantizer.from_pretrained(transformers_model, task=task) + quantizer.quantize( + save_directory=tmp_dir, + weights_only=True, + quantization_config=OVConfig(compression={"type": "int4_sym_g128", "ratio": 0.8}), + ) + model = model_cls.from_pretrained(tmp_dir) + + _, num_int8, num_int4 = get_num_quantized_nodes(model) + self.assertEqual(expected_int8, num_int8) + self.assertEqual(expected_int4, num_int4) + + tokens = tokenizer("This is a sample input", return_tensors="pt") + outputs = model(**tokens) + self.assertTrue("logits" in outputs) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): model = model_cls.from_pretrained(MODEL_NAMES[model_type], export=True, load_in_8bit=True) @@ -222,7 +253,7 @@ def test_ovmodel_load_with_compressed_weights(self, model_cls, model_type): expected_ov_int8 = _ARCHITECTURES_TO_EXPECTED_INT8[model_type] for i, model in enumerate(models): - _, num_int8 = get_num_quantized_nodes(model) + _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_ov_int8[i], num_int8) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION) @@ -238,7 +269,7 @@ def test_ovmodel_load_with_uncompressed_weights(self, model_cls, model_type): models = [model] for i, model in enumerate(models): - _, num_int8 = get_num_quantized_nodes(model) + _, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(0, num_int8) @@ -349,7 +380,7 @@ def compute_metrics(p): trainer.save_model() model = OVModelForSequenceClassification.from_pretrained(tmp_dir) - num_fake_quantize, num_int8 = get_num_quantized_nodes(model) + num_fake_quantize, num_int8, _ = get_num_quantized_nodes(model) self.assertEqual(expected_fake_quantize, num_fake_quantize) self.assertEqual(expected_int8, num_int8) diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py index 0e2ea91e4c..1fce7c3fc8 100644 --- a/tests/openvino/test_stable_diffusion.py +++ b/tests/openvino/test_stable_diffusion.py @@ -28,10 +28,12 @@ from diffusers.utils import load_image from diffusers.utils.testing_utils import floats_tensor from openvino.runtime.ie_api import CompiledModel +from packaging.version import Version, parse from parameterized import parameterized from utils_tests import MODEL_NAMES, SEED from 
optimum.intel import ( + OVLatentConsistencyModelPipeline, OVStableDiffusionImg2ImgPipeline, OVStableDiffusionInpaintPipeline, OVStableDiffusionPipeline, @@ -50,6 +52,7 @@ ORTStableDiffusionXLImg2ImgPipeline, ORTStableDiffusionXLPipeline, ) +from optimum.utils.import_utils import _diffusers_version def _generate_inputs(batch_size=1): @@ -475,3 +478,68 @@ def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) inputs["strength"] = 0.75 return inputs + + +class OVLatentConsistencyModelPipelineTest(unittest.TestCase): + SUPPORTED_ARCHITECTURES = ("latent-consistency",) + MODEL_CLASS = OVLatentConsistencyModelPipeline + TASK = "text-to-image" + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skipIf(parse(_diffusers_version) <= Version("0.21.4"), "not supported with this diffusers version") + def test_compare_to_diffusers(self, model_arch: str): + ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) + self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder) + self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder) + self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder) + self.assertIsInstance(ov_pipeline.unet, OVModelUnet) + self.assertIsInstance(ov_pipeline.config, Dict) + + from diffusers import LatentConsistencyModelPipeline + + pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) + batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128 + latents = ov_pipeline.prepare_latents( + batch_size * num_images_per_prompt, + ov_pipeline.unet.config["in_channels"], + height, + width, + dtype=np.float32, + generator=np.random.RandomState(0), + ) + + kwargs = { + "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, + "num_inference_steps": 1, + "num_images_per_prompt": num_images_per_prompt, + "height": height, + "width": width, + "guidance_scale": 8.5, + } + + for output_type in ["latent", "np"]: + ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images + self.assertIsInstance(ov_outputs, np.ndarray) + with torch.no_grad(): + outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images + + # Compare model outputs + self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4)) + # Compare model devices + self.assertEqual(pipeline.device.type, ov_pipeline.device) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skipIf(parse(_diffusers_version) <= Version("0.21.4"), "not supported with this diffusers version") + def test_num_images_per_prompt_static_model(self, model_arch: str): + model_id = MODEL_NAMES[model_arch] + pipeline = self.MODEL_CLASS.from_pretrained(model_id, export=True, compile=False, dynamic_shapes=False) + batch_size, num_images, height, width = 3, 4, 128, 64 + pipeline.half() + pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) + self.assertFalse(pipeline.is_dynamic) + pipeline.compile() + + for _height in [height, height + 16]: + inputs = _generate_inputs(batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images + self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py index 91defbefbb..006a99f2f9 100644 --- a/tests/openvino/test_training.py +++ 
b/tests/openvino/test_training.py @@ -318,7 +318,7 @@ def tearDown(self): "default_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=43, + expected_fake_quantize=42, expected_int8=32, compression_metrics=["compression_loss"], ), @@ -326,14 +326,14 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG, - expected_fake_quantize=43, + expected_fake_quantize=42, expected_int8=32, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), "customized_quantization": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=70, + expected_fake_quantize=69, expected_int8=35, compression_metrics=["compression_loss"], ), @@ -341,7 +341,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG, - expected_fake_quantize=70, + expected_fake_quantize=69, expected_int8=35, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], ), @@ -361,7 +361,7 @@ def tearDown(self): "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=43, + expected_fake_quantize=42, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -369,7 +369,7 @@ def tearDown(self): "customized_quantization,structured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=70, + expected_fake_quantize=69, expected_int8=35, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -378,7 +378,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=43, + expected_fake_quantize=42, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -387,7 +387,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[CUSTOMIZED_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=70, + expected_fake_quantize=69, expected_int8=35, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -408,7 +408,7 @@ def tearDown(self): "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=43, + expected_fake_quantize=42, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -416,7 +416,7 @@ def tearDown(self): "customized_quantization,unstructured_movement_sparsity": 
OVTrainerTestDescriptor( model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=70, + expected_fake_quantize=69, expected_int8=35, expected_binary_masks=60, compression_metrics=["compression_loss"], @@ -425,7 +425,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=43, + expected_fake_quantize=42, expected_int8=32, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], @@ -434,7 +434,7 @@ def tearDown(self): model_id="hf-internal-testing/tiny-random-bert", teacher_model_id="hf-internal-testing/tiny-random-bert", nncf_compression_config=[CUSTOMIZED_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT], - expected_fake_quantize=70, + expected_fake_quantize=69, expected_int8=35, expected_binary_masks=60, compression_metrics=["compression_loss", "distillation_loss", "task_loss"], diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index c2feb4d264..6cfeb29bb4 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -24,7 +24,7 @@ "bart": "hf-internal-testing/tiny-random-bart", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-blenderbot", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", @@ -39,7 +39,7 @@ "distilbert": "hf-internal-testing/tiny-random-distilbert", "electra": "hf-internal-testing/tiny-random-electra", "flaubert": "hf-internal-testing/tiny-random-flaubert", - # "gpt_bigcode": "bigcode/tiny_starcoder_py", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", @@ -51,8 +51,10 @@ "llama": "fxmarty/tiny-llama-fast-tokenizer", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", "opt": "hf-internal-testing/tiny-random-OPTModel", - "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken + "opt125m": "facebook/opt-125m", + "marian": "sshleifer/tiny-marian-en-de", "mbart": "hf-internal-testing/tiny-random-mbart", + "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", "mobilenet_v2": "hf-internal-testing/tiny-random-MobileNetV2Model", @@ -67,12 +69,14 @@ "roberta": "hf-internal-testing/tiny-random-roberta", "roformer": "hf-internal-testing/tiny-random-roformer", "segformer": "hf-internal-testing/tiny-random-SegformerModel", + "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel", "squeezebert": "hf-internal-testing/tiny-random-squeezebert", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "stable-diffusion-xl-refiner": "echarlaix/tiny-random-stable-diffusion-xl-refiner", + "latent-consistency": 
"echarlaix/tiny-random-latent-consistency", "sew": "hf-internal-testing/tiny-random-SEWModel", - "sew_d": "hf-internal-testing/tiny-random-SEWDModel", + "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h", "swin": "hf-internal-testing/tiny-random-SwinModel", "t5": "hf-internal-testing/tiny-random-t5", "unispeech": "hf-internal-testing/tiny-random-unispeech", @@ -82,6 +86,7 @@ "wav2vec2": "anton-l/wav2vec2-random-tiny-classifier", "wav2vec2-hf": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", + "whisper": "openai/whisper-tiny.en", "xlm": "hf-internal-testing/tiny-random-xlm", "xlm_roberta": "hf-internal-testing/tiny-xlm-roberta", } @@ -96,28 +101,34 @@ _ARCHITECTURES_TO_EXPECTED_INT8 = { - "bert": (34,), - "roberta": (34,), - "albert": (42,), - "vit": (31,), - "blenderbot": (35,), - "gpt2": (22,), - "wav2vec2": (15,), - "distilbert": (33,), - "t5": (32, 52, 42), - "stable-diffusion": (74, 4, 4, 32), - "stable-diffusion-xl": (148, 4, 4, 33), - "stable-diffusion-xl-refiner": (148, 4, 4, 33), + "bert": (68,), + "roberta": (68,), + "albert": (84,), + "vit": (62,), + "blenderbot": (70,), + "gpt2": (44,), + "wav2vec2": (30,), + "distilbert": (66,), + "t5": (64, 104, 84), + "stable-diffusion": (148, 8, 8, 64), + "stable-diffusion-xl": (296, 8, 8, 66), + "stable-diffusion-xl-refiner": (296, 8, 8, 66), } +_ARCHITECTURES_TO_EXPECTED_INT4_INT8 = {"opt125m": (82, 295)} + + def get_num_quantized_nodes(ov_model): num_fake_quantize = 0 num_int8 = 0 + num_int4 = 0 for elem in ov_model.model.get_ops(): if "FakeQuantize" in elem.name: num_fake_quantize += 1 for i in range(elem.get_output_size()): if "8" in elem.get_output_element_type(i).get_type_name(): num_int8 += 1 - return num_fake_quantize, num_int8 + if "4" in elem.get_output_element_type(i).get_type_name(): + num_int4 += 1 + return num_fake_quantize, num_int8, num_int4