diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 96ef047aaf..2ed91535f7 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - transformers-version: [4.39.0, 4.41.2] + transformers-version: [4.39.0, 4.42.3] os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index ff38fb41df..6f9675cde7 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -21,7 +21,7 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.12"] - transformers-version: ["4.36.0", "4.41.*"] + transformers-version: ["4.36.0", "4.42.*"] os: [ubuntu-latest] runs-on: ${{ matrix.os }} diff --git a/docs/Dockerfile b/docs/Dockerfile index 7cfa6cd514..4acde4e659 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -25,4 +25,4 @@ RUN npm install npm@9.8.1 -g && \ RUN python3 -m pip install --no-cache-dir --upgrade pip RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/doc-builder.git RUN git clone $clone_url && cd optimum-intel && git checkout $commit_sha -RUN python3 -m pip install --no-cache-dir ./optimum-intel[neural-compressor,openvino,nncf,quality] +RUN python3 -m pip install --no-cache-dir ./optimum-intel[neural-compressor,openvino,diffusers,quality] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 1fd3fe6b71..7053a17ef2 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -22,6 +22,13 @@ title: Supported Models - local: openvino/reference title: Reference + - sections: + - local: openvino/tutorials/notebooks + title: Notebooks + - local: openvino/tutorials/diffusers + title: Generate images with Diffusion models + title: Tutorials + isExpanded: false title: OpenVINO title: Optimum Intel isExpanded: false diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index 8cffd06121..c11016fde1 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -14,25 +14,15 @@ specific language governing permissions and limitations under the License. To export your model to the [OpenVINO IR](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) format with the CLI : ```bash -optimum-cli export openvino --model gpt2 ov_model/ +optimum-cli export openvino --model meta-llama/Meta-Llama-3-8B ov_model/ ``` The model argument can either be the model ID of a model hosted on the [Hub](https://huggingface.co/models) or a path to a model hosted locally. For local models, you need to specify the task for which the model should be loaded before export, among the list of the [supported tasks](https://huggingface.co/docs/optimum/main/en/exporters/task_manager). - ```bash -optimum-cli export openvino --model local_model_dir --task text-generation-with-past ov_model/ +optimum-cli export openvino --model local_llama --task text-generation-with-past ov_model/ ``` -The `-with-past` suffix enable the re-use of past keys and values. This allows to avoid recomputing the same intermediate activations during the generation. to export the model without, you will need to remove this suffix. 
- -| With K-V cache | Without K-V cache | -|------------------------------------------|--------------------------------------| -| `text-generation-with-past` | `text-generation` | -| `text2text-generation-with-past` | `text2text-generation` | -| `automatic-speech-recognition-with-past` | `automatic-speech-recognition` | - - Check out the help for more options: ```bash @@ -70,7 +60,7 @@ Optional arguments: --pad-token-id PAD_TOKEN_ID This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. --ratio RATIO A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80% of the layers will be quantized to int4 while - 20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8. + 20% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. --sym Whether to apply symmetric quantization --group-size GROUP_SIZE The group size to use for int4 quantization. Recommended value is 128 and -1 will results in per-column quantization. @@ -97,7 +87,7 @@ Optional arguments: You can also apply fp16, 8-bit or 4-bit weight-only quantization on the Linear, Convolutional and Embedding layers when exporting your model by setting `--weight-format` to respectively `fp16`, `int8` or `int4`: ```bash -optimum-cli export openvino --model gpt2 --weight-format int8 ov_model/ +optimum-cli export openvino --model meta-llama/Meta-Llama-3-8B --weight-format int8 ov_model/ ``` For more information on the quantization parameters checkout the [documentation](inference#weight-only-quantization) @@ -109,6 +99,33 @@ Models larger than 1 billion parameters are exported to the OpenVINO format with + +### Decoder models + +For models with a decoder, we enable the re-use of past keys and values by default. This allows to avoid recomputing the same intermediate activations at each generation step. To export the model without, you will need to remove the `-with-past` suffix when specifying the task. + +| With K-V cache | Without K-V cache | +|------------------------------------------|--------------------------------------| +| `text-generation-with-past` | `text-generation` | +| `text2text-generation-with-past` | `text2text-generation` | +| `automatic-speech-recognition-with-past` | `automatic-speech-recognition` | + + +### Diffusion models + +When Stable Diffusion models are exported to the OpenVINO format, they are decomposed into different components that are later combined during inference: + +* Text encoder(s) +* U-Net +* VAE encoder +* VAE decoder + +To export your Stable Diffusion XL model to the OpenVINO IR format with the CLI you can do as follows: + +```bash +optimum-cli export openvino --model stabilityai/stable-diffusion-xl-base-1.0 ov_sdxl/ +``` + ## When loading your model You can also load your PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, by setting `export=True` when loading your model. 
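The snippet below is a minimal sketch of this on-the-fly conversion followed by `save_pretrained()`; it reuses the `meta-llama/Meta-Llama-3-8B` checkpoint from the export examples above, and the `ov_model` output directory is an illustrative assumption:

```python
from optimum.intel import OVModelForCausalLM

# Load the PyTorch checkpoint and convert it to the OpenVINO IR format on the fly
model = OVModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", export=True)

# Save the converted model so it can later be reloaded without export=True
model.save_pretrained("ov_model")
```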
@@ -121,7 +138,7 @@ To easily save the resulting model, you can use the `save_pretrained()` method, + from optimum.intel import OVModelForCausalLM from transformers import AutoTokenizer - model_id = "gpt2" + model_id = "meta-llama/Meta-Llama-3-8B" - model = AutoModelForCausalLM.from_pretrained(model_id) + model = OVModelForCausalLM.from_pretrained(model_id, export=True) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -137,7 +154,7 @@ To easily save the resulting model, you can use the `save_pretrained()` method, from transformers import AutoModelForCausalLM from optimum.exporters.openvino import export_from_model -model = AutoModelForCausalLM.from_pretrained("gpt2") +model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B") export_from_model(model, output="ov_model", task="text-generation-with-past") ``` diff --git a/docs/source/openvino/inference.mdx b/docs/source/openvino/inference.mdx index 822d6e2f99..0f2e30af47 100644 --- a/docs/source/openvino/inference.mdx +++ b/docs/source/openvino/inference.mdx @@ -11,9 +11,12 @@ specific language governing permissions and limitations under the License. Optimum Intel can be used to load optimized models from the [Hub](https://huggingface.co/models?library=openvino&sort=downloads) and create pipelines to run inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/2024/about-openvino/compatibility-and-support/supported-devices.html) the full list of supported devices) -## Transformers models -Once [your model was exported](export), you can load it by replacing the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. +## Loading + +### Transformers models + +Once [your model was exported](export), you can load it by replacing the `AutoModelForXxx` class with the corresponding `OVModelForXxx`. ```diff - from transformers import AutoModelForCausalLM @@ -22,403 +25,157 @@ Once [your model was exported](export), you can load it by replacing the `AutoMo model_id = "helenai/gpt2-ov" - model = AutoModelForCausalLM.from_pretrained(model_id) + # here the model was already exported so no need to set export=True + model = OVModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) results = pipe("He's a dreadful magician and") ``` -See the [reference documentation](reference) for more information about parameters, and examples for different tasks. - As shown in the table below, each task is associated with a class enabling to automatically load your model. 
-| Task | Auto Class | +| Auto Class | Task | |--------------------------------------|--------------------------------------| -| `text-classification` | `OVModelForSequenceClassification` | -| `token-classification` | `OVModelForTokenClassification` | -| `question-answering` | `OVModelForQuestionAnswering` | -| `audio-classification` | `OVModelForAudioClassification` | -| `image-classification` | `OVModelForImageClassification` | -| `feature-extraction` | `OVModelForFeatureExtraction` | -| `fill-mask` | `OVModelForMaskedLM` | -| `image-classification` | `OVModelForImageClassification` | -| `audio-classification` | `OVModelForAudioClassification` | -| `text-generation-with-past` | `OVModelForCausalLM` | -| `text2text-generation-with-past` | `OVModelForSeq2SeqLM` | -| `automatic-speech-recognition` | `OVModelForSpeechSeq2Seq` | -| `image-to-text` | `OVModelForVision2Seq` | - - -### Weight-only quantization - -You can also apply fp16, 8-bit or 4-bit weight compression on the Linear, Convolutional and Embedding layers when loading your model to reduce the memory footprint and inference latency. - -For more information on the quantization parameters checkout the [documentation](optimziation#weight-only-quantization). +| `OVModelForSequenceClassification` | `text-classification` | +| `OVModelForTokenClassification` | `token-classification` | +| `OVModelForQuestionAnswering` | `question-answering` | +| `OVModelForAudioClassification` | `audio-classification` | +| `OVModelForImageClassification` | `image-classification` | +| `OVModelForFeatureExtraction` | `feature-extraction` | +| `OVModelForMaskedLM` | `fill-mask` | +| `OVModelForImageClassification` | `image-classification` | +| `OVModelForAudioClassification` | `audio-classification` | +| `OVModelForCausalLM` | `text-generation-with-past` | +| `OVModelForSeq2SeqLM` | `text2text-generation-with-past` | +| `OVModelForSpeechSeq2Seq` | `automatic-speech-recognition` | +| `OVModelForVision2Seq` | `image-to-text` | + + +### Diffusers models - +Make sure you have 🤗 Diffusers installed. To install `diffusers`: -If not specified, `load_in_8bit` will be set to `True` by default when models larger than 1 billion parameters are exported to the OpenVINO format (with `export=True`). You can disable it with `load_in_8bit=False`. +```bash +pip install optimum[diffusers] +``` - +```diff +- from diffusers import StableDiffusionPipeline ++ from optimum.intel import OVStableDiffusionPipeline -It's also possible to apply quantization on both weights and activations using the `OVQuantizer`, more information in the [documentation](optimization#static-quantization). + model_id = "echarlaix/stable-diffusion-v1-5-openvino" +- pipeline = StableDiffusionPipeline.from_pretrained(model_id) ++ pipeline = OVStableDiffusionPipeline.from_pretrained(model_id) + prompt = "sailing ship in storm by Rembrandt" + images = pipeline(prompt).images +``` -### Static shape -By default, `OVModelForXxx` support dynamic shapes, enabling inputs of every shapes. To speed up inference, static shapes can be enabled by giving the desired inputs shapes. +As shown in the table below, each task is associated with a class enabling to automatically load your model. 
-```python -# Fix the batch size to 1 and the sequence length to 9 -model.reshape(1, 9) -# Compile the model before the first inference -model.compile() -``` +| Auto Class | Task | +|--------------------------------------|--------------------------------------| +| `OVStableDiffusionPipeline` | `text-to-image` | +| `OVStableDiffusionImg2ImgPipeline` | `image-to-image` | +| `OVStableDiffusionInpaintPipeline` | `inpaint` | +| `OVStableDiffusionXLPipeline` | `text-to-image` | +| `OVStableDiffusionXLImg2ImgPipeline` | `image-to-image` | +| `OVLatentConsistencyModelPipeline` | `text-to-image` | -When fixing the shapes with the `reshape()` method, inference cannot be performed with an input of a different shape. When instantiating your pipeline, you can specify the maximum total input sequence length after tokenization in order for shorter sequences to be padded and for longer sequences to be truncated. -```python -from datasets import load_dataset -from transformers import AutoTokenizer, pipeline -from evaluate import evaluator -from optimum.intel import OVModelForQuestionAnswering +See the [reference documentation](reference) for more information about parameters, and examples for different tasks. -model_id = "distilbert-base-cased-distilled-squad" -model = OVModelForQuestionAnswering.from_pretrained(model_id, export=True) -model.reshape(1, 384) -tokenizer = AutoTokenizer.from_pretrained(model_id) -eval_dataset = load_dataset("squad", split="validation").select(range(50)) -task_evaluator = evaluator("question-answering") -qa_pipe = pipeline( - "question-answering", - model=model, - tokenizer=tokenizer, - max_seq_len=384, - padding="max_length", - truncation=True, -) -metric = task_evaluator.compute(model_or_pipeline=qa_pipe, data=eval_dataset, metric="squad") -``` -### Compilation +## Compilation -By default the model will be compiled when instantiating our `OVModel`. In the case where the model is reshaped or placed to another device, the model will need to be recompiled again, which will happen by default before the first inference (thus inflating the latency of the first inference). To avoid an unnecessary compilation, you can disable the first compilation by setting `compile=False`. The model can be compiled before the first inference with `model.compile()`. +By default the model will be compiled when instantiating an `OVModel`. In the case where the model is reshaped or placed to another device, the model will need to be recompiled again, which will happen by default before the first inference (thus inflating the latency of the first inference). To avoid an unnecessary compilation, you can disable the first compilation by setting `compile=False`. ```python -from optimum.intel import OVModelForSequenceClassification +from optimum.intel import OVModelForQuestionAnswering -model_id = "distilbert-base-uncased-finetuned-sst-2-english" +model_id = "distilbert/distilbert-base-cased-distilled-squad" # Load the model and disable the model compilation -model = OVModelForSequenceClassification.from_pretrained(model_id, export=True, compile=False) -# Reshape to a static sequence length of 128 -model.reshape(1,128) -# Compile the model before the first inference -model.compile() +model = OVModelForQuestionAnswering.from_pretrained(model_id, compile=False) ``` To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. 
(See [OpenVINO documentation](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html) about installing drivers for GPU inference). ```python -# Static shapes speed up inference -model.reshape(1, 9) model.to("gpu") -# Compile the model before the first inference -model.compile() -``` - -### Configuration - - -It is possible to pass an `ov_config` parameter to `from_pretrained()` with custom OpenVINO configuration values. This can be used for example to enable full precision inference on devices where FP16 or BF16 inference precision is used by default. - - -```python -model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"INFERENCE_PRECISION_HINT":"f32"}) -``` - -Optimum Intel leverages OpenVINO's model caching to speed up model compiling on GPU. By default a `model_cache` directory is created in the model's directory in the [Hugging Face Hub cache](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache). To override this, use the ov_config parameter and set `CACHE_DIR` to a different value. To disable model caching on GPU, set `CACHE_DIR` to an empty string. - -```python -model = OVModelForSequenceClassification.from_pretrained(model_id, device="GPU", ov_config={"PERFORMANCE_HINT": "LATENCY", "CACHE_DIR":""}) -``` - -### Sequence-to-sequence models - -Sequence-to-sequence (Seq2Seq) models, that generate a new sequence from an input, can also be used when running inference with OpenVINO. When Seq2Seq models are exported to the OpenVINO IR, they are decomposed into two parts : the encoder and the "decoder" (which actually consists of the decoder with the language modeling head), that are later combined during inference. -To speed up sequential decoding, a cache with pre-computed key/values hidden-states will be used by default. An additional model component will be exported: the "decoder" with pre-computed key/values as one of its inputs. This specific export comes from the fact that during the first pass, the decoder has no pre-computed key/values hidden-states, while during the rest of the generation past key/values will be used to speed up sequential decoding. To disable this cache, set `use_cache=False` in the `from_pretrained()` method. - -Here is an example on how you can run inference for a translation task using a T5 model and then export it to OpenVINO IR: - - -```python -from transformers import AutoTokenizer, pipeline -from optimum.intel import OVModelForSeq2SeqLM - -model_id = "t5-small" -model = OVModelForSeq2SeqLM.from_pretrained(model_id, export=True) -tokenizer = AutoTokenizer.from_pretrained(model_id) -translation_pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) -text = "He never went out without a book under his arm, and he often came back with two." -result = translation_pipe(text) - -# Save the exported model -save_directory = "openvino_t5" -model.save_pretrained(save_directory) -tokenizer.save_pretrained(save_directory) - -[{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] -``` - -## Diffusers models - -Make sure you have 🤗 Diffusers installed. To install `diffusers`: - -```bash -pip install optimum[diffusers] ``` -### Stable Diffusion - -Stable Diffusion models can also be used when running inference with OpenVINO. 
When Stable Diffusion models -are exported to the OpenVINO format, they are decomposed into different components that are later combined during inference: -- The text encoder -- The U-NET -- The VAE encoder -- The VAE decoder - -| Task | Auto Class | -|--------------------------------------|--------------------------------------| -| `text-to-image` | `OVStableDiffusionPipeline` | -| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | -| `inpaint` | `OVStableDiffusionInpaintPipeline` | - - -#### Text-to-Image -Here is an example of how you can load an OpenVINO Stable Diffusion model and run inference using OpenVINO Runtime: +The model can be compiled: ```python -from optimum.intel import OVStableDiffusionPipeline - -model_id = "echarlaix/stable-diffusion-v1-5-openvino" -pipeline = OVStableDiffusionPipeline.from_pretrained(model_id) -prompt = "sailing ship in storm by Rembrandt" -images = pipeline(prompt).images +model.compile() ``` -To load your PyTorch model and convert it to OpenVINO on the fly, you can set `export=True`. - -```python -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = OVStableDiffusionPipeline.from_pretrained(model_id, export=True) -# Don't forget to save the exported model -pipeline.save_pretrained("openvino-sd-v1-5") -``` +## Static shape -To further speed up inference, the model can be statically reshaped : +By default, dynamic shapes are supported, enabling inference for inputs of every shape. To speed up inference, static shapes can be enabled by giving the desired input shapes with [.reshape()](reference#optimum.intel.OVBaseModel.reshape). ```python -# Define the shapes related to the inputs and desired outputs -batch_size = 1 -num_images_per_prompt = 1 -height = 512 -width = 512 - -# Statically reshape the model -pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt) -# Compile the model before the first inference -pipeline.compile() - -# Run inference -images = pipeline(prompt, height=height, width=width, num_images_per_prompt=num_images_per_prompt).images +# Fix the batch size to 1 and the sequence length to 40 +batch_size, seq_len = 1, 40 +model.reshape(batch_size, seq_len) ``` -In case you want to change any parameters such as the outputs height or width, you'll need to statically reshape your model once again. - -
- -
- -#### Text-to-Image with Textual Inversion -Here is an example of how you can load an OpenVINO Stable Diffusion model with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: - - -First, you can run original pipeline without textual inversion -```python -from optimum.intel import OVStableDiffusionPipeline -import numpy as np - -model_id = "echarlaix/stable-diffusion-v1-5-openvino" -prompt = "A back-pack" -# Set a random seed for better comparison -np.random.seed(42) - -pipeline = OVStableDiffusionPipeline.from_pretrained(model_id, export=False, compile=False) -pipeline.compile() -image1 = pipeline(prompt, num_inference_steps=50).images[0] -image1.save("stable_diffusion_v1_5_without_textual_inversion.png") -``` +When fixing the shapes with the `reshape()` method, inference cannot be performed with an input of a different shape. -Then, you can load [sd-concepts-library/cat-toy](https://huggingface.co/sd-concepts-library/cat-toy) textual inversion embedding and run pipeline with same prompt again ```python -# Reset stable diffusion pipeline -pipeline.clear_requests() -# Load textual inversion into stable diffusion pipeline -pipeline.load_textual_inversion("sd-concepts-library/cat-toy", "") +from transformers import AutoTokenizer +from optimum.intel import OVModelForQuestionAnswering +model_id = "distilbert/distilbert-base-cased-distilled-squad" +model = OVModelForQuestionAnswering.from_pretrained(model_id, compile=False) +tokenizer = AutoTokenizer.from_pretrained(model_id) +batch_size, seq_len = 1, 40 +model.reshape(batch_size, seq_len) # Compile the model before the first inference -pipeline.compile() -image2 = pipeline(prompt, num_inference_steps=50).images[0] -image2.save("stable_diffusion_v1_5_with_textual_inversion.png") -``` -The left image shows the generation result of original stable diffusion v1.5, the right image shows the generation result of stable diffusion v1.5 with textual inversion. 
- -| | | -|---|---| -| ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_with_textual_inversion.png) | - - -#### Image-to-Image - -```python -import requests -import torch -from PIL import Image -from io import BytesIO -from optimum.intel import OVStableDiffusionImg2ImgPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = OVStableDiffusionImg2ImgPipeline.from_pretrained(model_id, export=True) - -url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" -response = requests.get(url) -init_image = Image.open(BytesIO(response.content)).convert("RGB") -init_image = init_image.resize((768, 512)) -prompt = "A fantasy landscape, trending on artstation" -image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] -image.save("fantasy_landscape.png") -``` - -### Stable Diffusion XL - -| Task | Auto Class | -|--------------------------------------|--------------------------------------| -| `text-to-image` | `OVStableDiffusionXLPipeline` | -| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | - - -#### Text-to-Image - -Here is an example of how you can load a SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using OpenVINO Runtime: +model.compile() -```python -from optimum.intel import OVStableDiffusionXLPipeline +question = "Which name is also used to describe the Amazon rainforest ?" +context = "The Amazon rainforest, also known as Amazonia or the Amazon Jungle" +tokens = tokenizer(question, context, max_length=seq_len, padding="max_length", return_tensors="np") -model_id = "stabilityai/stable-diffusion-xl-base-1.0" -base = OVStableDiffusionXLPipeline.from_pretrained(model_id) -prompt = "train station by Caspar David Friedrich" -image = base(prompt).images[0] -image.save("train_station.png") +outputs = model(**tokens) ``` -| | | -|---|---| -| ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich_2.png) | - -#### Text-to-Image with Textual Inversion - -Here is an example of how you can load an SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: - - -First, you can run original pipeline without textual inversion -```python -from optimum.intel import OVStableDiffusionXLPipeline -import numpy as np - -model_id = "stabilityai/stable-diffusion-xl-base-1.0" -prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a beautiful woman wearing a red jacket and black shirt, best quality, intricate details." 
-# Set a random seed for better comparison -np.random.seed(112) - -base = OVStableDiffusionXLPipeline.from_pretrained(model_id, export=False, compile=False) -base.compile() -image1 = base(prompt, num_inference_steps=50).images[0] -image1.save("sdxl_without_textual_inversion.png") -``` +For models that handle images, you can also specify the `height` and `width` when reshaping your model: -Then, you can load [charturnerv2](https://civitai.com/models/3036/charturner-character-turnaround-helper-for-15-and-21) textual inversion embedding and run pipeline with same prompt again ```python -# Reset stable diffusion pipeline -base.clear_requests() - -# Load textual inversion into stable diffusion pipeline -base.load_textual_inversion("./charturnerv2.pt", "charturnerv2") - -# Compile the model before the first inference -base.compile() -image2 = base(prompt, num_inference_steps=50).images[0] -image2.save("sdxl_with_textual_inversion.png") - -The left image shows the generation result of the original SDXL base 1.0, the right image shows the generation result of SDXL base 1.0 with textual inversion. +batch_size, num_images, height, width = 1, 1, 512, 512 +pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images) +images = pipeline(prompt, height=height, width=width, num_images_per_prompt=num_images).images ``` -| | | -|---|---| -| ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_with_textual_inversion.png) | - +## Configuration -#### Image-to-Image - -Here is an example of how you can load a PyTorch SDXL model, convert it to OpenVINO on-the-fly and run inference using OpenVINO Runtime for *image-to-image*: +The `ov_config` parameter allow to provide custom OpenVINO configuration values. This can be used for example to enable full precision inference on devices where FP16 or BF16 inference precision is used by default. ```python -from optimum.intel import OVStableDiffusionXLImg2ImgPipeline -from diffusers.utils import load_image - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -pipeline = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) - -url = "https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/castle_friedrich.png" -image = load_image(url).convert("RGB") -prompt = "medieval castle by Caspar David Friedrich" -image = pipeline(prompt, image=image).images[0] -# Don't forget to save your OpenVINO model so that you can load it without exporting it with `export=True` -pipeline.save_pretrained("openvino-sd-xl-refiner-1.0") +ov_config = {"INFERENCE_PRECISION_HINT": "f32"} +model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config=ov_config) ``` - -#### Refining the image output - -The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. - +Optimum Intel leverages OpenVINO's model caching to speed up model compiling on GPU. By default a `model_cache` directory is created in the model's directory in the [Hugging Face Hub cache](https://huggingface.co/docs/huggingface_hub/main/en/guides/manage-cache). 
To override this, use the ov_config parameter and set `CACHE_DIR` to a different value. To disable model caching on GPU, set `CACHE_DIR` to an empty string. ```python -from optimum.intel import OVStableDiffusionXLImg2ImgPipeline - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -refiner = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) - -image = base(prompt=prompt, output_type="latent").images[0] -image = refiner(prompt=prompt, image=image[None, :]).images[0] +ov_config = {"CACHE_DIR": ""} +model = OVModelForSequenceClassification.from_pretrained(model_id, device="gpu", ov_config=ov_config) ``` +## Weight quantization -### Latent Consistency Models - - -| Task | Auto Class | -|--------------------------------------|--------------------------------------| -| `text-to-image` | `OVLatentConsistencyModelPipeline` | +You can also apply fp16, 8-bit or 4-bit weight compression on the Linear, Convolutional and Embedding layers when loading your model to reduce the memory footprint and inference latency. +For more information on the quantization parameters checkout the [documentation](optimziation#weight-only-quantization). -#### Text-to-Image + -Here is an example of how you can load a Latent Consistency Models (LCMs) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using OpenVINO : +If not specified, `load_in_8bit` will be set to `True` by default when models larger than 1 billion parameters are exported to the OpenVINO format (with `export=True`). You can disable it with `load_in_8bit=False`. -```python -from optimum.intel import OVLatentConsistencyModelPipeline + -model_id = "SimianLuo/LCM_Dreamshaper_v7" -pipeline = OVLatentConsistencyModelPipeline.from_pretrained(model_id, export=True) -prompt = "sailing ship in storm by Leonardo da Vinci" -images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images -``` +It's also possible to apply quantization on both weights and activations using the [`OVQuantizer`](optimization#static-quantization). diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx index 523928b8de..83acd37efc 100644 --- a/docs/source/openvino/models.mdx +++ b/docs/source/openvino/models.mdx @@ -7,6 +7,8 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> +# Supported models + 🤗 Optimum handles the export of models to OpenVINO in the `exporters.openvino` module. It provides classes, functions, and a command line interface to perform the export easily. Here is the list of the supported architectures : diff --git a/docs/source/openvino/reference.mdx b/docs/source/openvino/reference.mdx index 32385eae00..c5043d877a 100644 --- a/docs/source/openvino/reference.mdx +++ b/docs/source/openvino/reference.mdx @@ -16,6 +16,12 @@ limitations under the License. # Models +## Generic model classes + +[[autodoc]] openvino.modeling_base.OVBaseModel + - _from_pretrained + - reshape + ## Natural Language Processing The following classes are available for the following natural language processing tasks. @@ -106,6 +112,7 @@ The following classes are available for the following multimodal tasks. [[autodoc]] openvino.modeling_seq2seq.OVModelForPix2Struct - forward + ## Custom Tasks ### OVModelForCustomTasks @@ -119,8 +126,40 @@ The following classes are available for the following multimodal tasks. 
- forward -# Quantization +## Text-to-image + +### OVStableDiffusionPipeline + +[[autodoc]] openvino.modeling_diffusion.OVStableDiffusionPipeline + - forward + +### OVStableDiffusionXLPipeline + +[[autodoc]] openvino.modeling_diffusion.OVStableDiffusionXLPipeline + - forward + +### OVLatentConsistencyModelPipeline + +[[autodoc]] openvino.modeling_diffusion.OVLatentConsistencyModelPipeline + - forward -### OVQuantizer -[[autodoc]] openvino.quantization.OVQuantizer +## Image-to-image + +### OVStableDiffusionImg2ImgPipeline + +[[autodoc]] openvino.modeling_diffusion.OVStableDiffusionImg2ImgPipeline + - forward + +### OVStableDiffusionXLImg2ImgPipeline + +[[autodoc]] openvino.modeling_diffusion.OVStableDiffusionXLImg2ImgPipeline + - forward + +## Inpainting + +### OVStableDiffusionInpaintPipeline + +[[autodoc]] openvino.modeling_diffusion.OVStableDiffusionInpaintPipeline + - forward + diff --git a/docs/source/openvino/tutorials/diffusers.mdx b/docs/source/openvino/tutorials/diffusers.mdx new file mode 100644 index 0000000000..dad09420b1 --- /dev/null +++ b/docs/source/openvino/tutorials/diffusers.mdx @@ -0,0 +1,250 @@ + + +# Generate images with Diffusion models + +## Stable Diffusion + +Stable Diffusion models can also be used when running inference with OpenVINO. When Stable Diffusion models +are exported to the OpenVINO format, they are decomposed into different components that are later combined during inference: +- The text encoder +- The U-NET +- The VAE encoder +- The VAE decoder + +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionPipeline` | +| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | +| `inpaint` | `OVStableDiffusionInpaintPipeline` | + + +### Text-to-Image +Here is an example of how you can load an OpenVINO Stable Diffusion model and run inference using OpenVINO Runtime: + +```python +from optimum.intel import OVStableDiffusionPipeline + +model_id = "echarlaix/stable-diffusion-v1-5-openvino" +pipeline = OVStableDiffusionPipeline.from_pretrained(model_id) +prompt = "sailing ship in storm by Rembrandt" +images = pipeline(prompt).images +``` + +To load your PyTorch model and convert it to OpenVINO on the fly, you can set `export=True`. + +```python +model_id = "runwayml/stable-diffusion-v1-5" +pipeline = OVStableDiffusionPipeline.from_pretrained(model_id, export=True) +# Don't forget to save the exported model +pipeline.save_pretrained("openvino-sd-v1-5") +``` + +To further speed up inference, the model can be statically reshaped : + +```python +# Define the shapes related to the inputs and desired outputs +batch_size = 1 +num_images_per_prompt = 1 +height = 512 +width = 512 + +# Statically reshape the model +pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt) +# Compile the model before the first inference +pipeline.compile() + +# Run inference +images = pipeline(prompt, height=height, width=width, num_images_per_prompt=num_images_per_prompt).images +``` + +In case you want to change any parameters such as the outputs height or width, you'll need to statically reshape your model once again. + +
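Continuing the example above, changing the output resolution would look like the following minimal sketch, where the 768x768 resolution is an illustrative assumption:

```python
# Reshape the pipeline again for the new resolution, then recompile it
batch_size, num_images_per_prompt = 1, 1
height = width = 768  # illustrative values

pipeline.reshape(batch_size=batch_size, height=height, width=width, num_images_per_prompt=num_images_per_prompt)
pipeline.compile()

# Run inference with the new static shapes
images = pipeline(prompt, height=height, width=width, num_images_per_prompt=num_images_per_prompt).images
```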
+ +
+ +### Text-to-Image with Textual Inversion +Here is an example of how you can load an OpenVINO Stable Diffusion model with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: + + +First, you can run original pipeline without textual inversion +```python +from optimum.intel import OVStableDiffusionPipeline +import numpy as np + +model_id = "echarlaix/stable-diffusion-v1-5-openvino" +prompt = "A back-pack" +# Set a random seed for better comparison +np.random.seed(42) + +pipeline = OVStableDiffusionPipeline.from_pretrained(model_id, export=False, compile=False) +pipeline.compile() +image1 = pipeline(prompt, num_inference_steps=50).images[0] +image1.save("stable_diffusion_v1_5_without_textual_inversion.png") +``` + +Then, you can load [sd-concepts-library/cat-toy](https://huggingface.co/sd-concepts-library/cat-toy) textual inversion embedding and run pipeline with same prompt again +```python +# Reset stable diffusion pipeline +pipeline.clear_requests() + +# Load textual inversion into stable diffusion pipeline +pipeline.load_textual_inversion("sd-concepts-library/cat-toy", "") + +# Compile the model before the first inference +pipeline.compile() +image2 = pipeline(prompt, num_inference_steps=50).images[0] +image2.save("stable_diffusion_v1_5_with_textual_inversion.png") +``` +The left image shows the generation result of original stable diffusion v1.5, the right image shows the generation result of stable diffusion v1.5 with textual inversion. + +| | | +|---|---| +| ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_with_textual_inversion.png) | + + +### Image-to-Image + +```python +import requests +import torch +from PIL import Image +from io import BytesIO +from optimum.intel import OVStableDiffusionImg2ImgPipeline + +model_id = "runwayml/stable-diffusion-v1-5" +pipeline = OVStableDiffusionImg2ImgPipeline.from_pretrained(model_id, export=True) + +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((768, 512)) +prompt = "A fantasy landscape, trending on artstation" +image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] +image.save("fantasy_landscape.png") +``` + +## Stable Diffusion XL + +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionXLPipeline` | +| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | + + +### Text-to-Image + +Here is an example of how you can load a SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using OpenVINO Runtime: + +```python +from optimum.intel import OVStableDiffusionXLPipeline + +model_id = "stabilityai/stable-diffusion-xl-base-1.0" +base = OVStableDiffusionXLPipeline.from_pretrained(model_id) +prompt = "train station by Caspar David Friedrich" +image = base(prompt).images[0] +image.save("train_station.png") +``` + +| | | +|---|---| +| 
![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich_2.png) | + +### Text-to-Image with Textual Inversion + +Here is an example of how you can load an SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: + + +First, you can run original pipeline without textual inversion +```python +from optimum.intel import OVStableDiffusionXLPipeline +import numpy as np + +model_id = "stabilityai/stable-diffusion-xl-base-1.0" +prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround wearing a red jacket and black shirt, best quality, intricate details." +# Set a random seed for better comparison +np.random.seed(112) + +base = OVStableDiffusionXLPipeline.from_pretrained(model_id, export=False, compile=False) +base.compile() +image1 = base(prompt, num_inference_steps=50).images[0] +image1.save("sdxl_without_textual_inversion.png") +``` + +Then, you can load [charturnerv2](https://civitai.com/models/3036/charturner-character-turnaround-helper-for-15-and-21) textual inversion embedding and run pipeline with same prompt again +```python +# Reset stable diffusion pipeline +base.clear_requests() + +# Load textual inversion into stable diffusion pipeline +base.load_textual_inversion("./charturnerv2.pt", "charturnerv2") + +# Compile the model before the first inference +base.compile() +image2 = base(prompt, num_inference_steps=50).images[0] +image2.save("sdxl_with_textual_inversion.png") +``` + +### Image-to-Image + +Here is an example of how you can load a PyTorch SDXL model, convert it to OpenVINO on-the-fly and run inference using OpenVINO Runtime for *image-to-image*: + +```python +from optimum.intel import OVStableDiffusionXLImg2ImgPipeline +from diffusers.utils import load_image + +model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" +pipeline = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) + +url = "https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/castle_friedrich.png" +image = load_image(url).convert("RGB") +prompt = "medieval castle by Caspar David Friedrich" +image = pipeline(prompt, image=image).images[0] +# Don't forget to save your OpenVINO model so that you can load it without exporting it with `export=True` +pipeline.save_pretrained("openvino-sd-xl-refiner-1.0") +``` + + +### Refining the image output + +The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. 
+ + +```python +from optimum.intel import OVStableDiffusionXLImg2ImgPipeline + +model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" +refiner = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) + +image = base(prompt=prompt, output_type="latent").images[0] +image = refiner(prompt=prompt, image=image[None, :]).images[0] +``` + + +## Latent Consistency Models + + +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVLatentConsistencyModelPipeline` | + + +### Text-to-Image + +Here is an example of how you can load a Latent Consistency Model (LCM) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using OpenVINO : + +```python +from optimum.intel import OVLatentConsistencyModelPipeline + +model_id = "SimianLuo/LCM_Dreamshaper_v7" +pipeline = OVLatentConsistencyModelPipeline.from_pretrained(model_id, export=True) +prompt = "sailing ship in storm by Leonardo da Vinci" +images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images +``` diff --git a/docs/source/openvino/tutorials/notebooks.mdx b/docs/source/openvino/tutorials/notebooks.mdx new file mode 100644 index 0000000000..3608e95c92 --- /dev/null +++ b/docs/source/openvino/tutorials/notebooks.mdx @@ -0,0 +1,26 @@ + + +# Notebooks + +## Inference + +| Notebook | Description | | | +|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|------:| +| [How to run inference with the OpenVINO](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb) | Explains how to export your model to OpenVINO and to run inference with OpenVINO Runtime on various tasks | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/optimum_openvino_inference.ipynb) | + +## Quantization + +| Notebook | Description | | | +|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|------:| +| [How to quantize a question answering model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb) | Show how to apply post-training 
quantization on a question answering model using [NNCF](https://github.com/openvinotoolkit/nncf) | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/question_answering_quantization.ipynb) | +| [How to quantize Stable Diffusion model with OpenVINO NNCF](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb) | Show how to apply post-training hybrid quantization on a Stable Diffusion model using [NNCF](https://github.com/openvinotoolkit/nncf) | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb)| +| [Compare outputs of a quantized Stable Diffusion model with its full-precision counterpart](https://github.com/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_optimization.ipynb) | Show how to load and compare outputs from two Stable Diffusion models with different precision | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_optimization.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/optimum-intel/blob/main/notebooks/openvino/stable_diffusion_optimization.ipynb) | + + diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index 5adcb36495..2bdee32e17 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -102,7 +102,7 @@ def parse_args_openvino(parser: "ArgumentParser"): default=None, help=( "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 " - "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 0.8." + "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0." 
), ) optional_group.add_argument( @@ -221,7 +221,7 @@ def parse_args(parser: "ArgumentParser"): def run(self): from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers - from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIGS, OVConfig + from ...intel.openvino.configuration import _DEFAULT_4BIT_CONFIG, _DEFAULT_4BIT_CONFIGS, OVConfig def _get_default_int4_config(model_id_or_path, library_name): if model_id_or_path in _DEFAULT_4BIT_CONFIGS: @@ -233,13 +233,7 @@ def _get_default_int4_config(model_id_or_path, library_name): if original_model_name in _DEFAULT_4BIT_CONFIGS: return _DEFAULT_4BIT_CONFIGS[original_model_name] - return { - "bits": 4, - "ratio": 0.8, - "sym": False, - "group_size": None, - "all_layers": None, - } + return _DEFAULT_4BIT_CONFIG library_name = TasksManager.infer_library_from_model(self.args.model, library_name=self.args.library) if library_name == "sentence_transformers" and self.args.library is None: @@ -283,7 +277,7 @@ def _get_default_int4_config(model_id_or_path, library_name): else: quantization_config = { "bits": 8 if is_int8 else 4, - "ratio": 1 if is_int8 else (self.args.ratio or 0.8), + "ratio": 1 if is_int8 else (self.args.ratio or _DEFAULT_4BIT_CONFIG["ratio"]), "sym": self.args.sym or False, "group_size": -1 if is_int8 else self.args.group_size, "all_layers": None if is_int8 else self.args.all_layers, diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index d1c6668b7e..0d43152889 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -12,29 +12,31 @@ # See the License for the specific language governing permissions and # limitations under the License. +from transformers.models.bert.modeling_bert import BertIntermediate from transformers.models.llama.modeling_llama import ( LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, LlamaRMSNorm, ) +from transformers.models.vit.modeling_vit import ViTIntermediate from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version from .modeling_utils import ( _IPEX_MINIMUM_VERSION_FOR_PATCHING, + _ipex_rms_layer_norm_forward, + _IPEXIntermediate, _IPEXLlamaDecoderLayer, - _llama_layer_norm_forward, _llama_model_forward, ) # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version _TRANSFORMERS_MIN_VERSION = "4.39.0" -_TRANSFORMERS_MAX_VERSION = "4.41.2" +_TRANSFORMERS_MAX_VERSION = "4.42.3" -_IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) -_IPEX_EXPORTED_TASK = ("text-generation",) +_IPEX_EXPORTED_GENERATION_TASKS = ("text-generation",) def convert_func(m, func_name, new_function): @@ -49,7 +51,7 @@ def convert_functions(m, target_m, new_function_name, new_function): convert_functions(sub_m, target_m, new_function_name, new_function) -def convert_class(m, target_m, new_class, config): +def convert_class(m, target_m, new_class, config=None): for name, sub_m in m.named_children(): if isinstance(sub_m, target_m): new_m = new_class(sub_m, config) @@ -65,6 +67,23 @@ def patch_op(m, target_m, new_op_name, new_op): def _patch_llama_model(model): + convert_functions(model, LlamaModel, "forward", _llama_model_forward) + convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward) + convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config) + return model + + +def _patch_bert_model(model): + convert_class(model, BertIntermediate, _IPEXIntermediate) + return model + + +def 
_patch_vit_model(model): + convert_class(model, ViTIntermediate, _IPEXIntermediate) + return model + + +def _patch_model(model): if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): raise ImportError(f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports llama model patching") if is_transformers_version("<", _TRANSFORMERS_MIN_VERSION) or is_transformers_version( @@ -73,13 +92,10 @@ def _patch_llama_model(model): raise ImportError( f"Only transformers versions {_TRANSFORMERS_MIN_VERSION} ~ {_TRANSFORMERS_MAX_VERSION} are verified." ) - convert_functions(model, LlamaModel, "forward", _llama_model_forward) - convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward) - convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config) - return model - - -def _patch_model(model): if isinstance(model, LlamaForCausalLM): model = _patch_llama_model(model) + elif model.config.model_type == "bert": + model = _patch_bert_model(model) + elif model.config.model_type == "vit": + model = _patch_vit_model(model) return model diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 2086369665..2c74a42327 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import math from typing import List, Optional, Tuple, Union @@ -25,11 +26,27 @@ from optimum.intel.utils.modeling_utils import _setattr_from_module +logger = logging.getLogger(__name__) + _IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0" +if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): + logger.warning( + f"Please upgrade the IPEX version to at least {_IPEX_MINIMUM_VERSION_FOR_PATCHING} if you want to patch the model." 
+ ) +else: + from intel_extension_for_pytorch.llm.modules import ( + IndirectAccessKVCacheAttention, + Linear2SiluMul, + LinearAdd, + LinearGelu, + RotaryEmbedding, + ) + + # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 -def _llama_layer_norm_forward(self, hidden_states): +def _ipex_rms_layer_norm_forward(self, hidden_states): return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon) @@ -139,14 +156,9 @@ def _llama_model_forward( # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L321 class _IPEXLlamaAttention(nn.Module): def __init__(self, module, config) -> None: - if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): - raise ImportError( - f"Only ipex version > {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports IndirectAccessKVCacheAttention, LinearAdd, RotaryEmbedding" - ) super().__init__() _setattr_from_module(self, module) self.config = config - from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, LinearAdd, RotaryEmbedding if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: self.mha_linear_add = LinearAdd(module.o_proj) @@ -296,14 +308,9 @@ def forward( # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L186 class _IPEXLlamaMLP(nn.Module): def __init__(self, module, config) -> None: - if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): - raise ImportError( - f"Only ipex version > {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports Linear2SiluMul, LinearAdd" - ) super().__init__() _setattr_from_module(self, module) self.config = config - from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd # LinearAllreduce and LinearLayer cannot use fused op LinearAdd if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: @@ -398,3 +405,16 @@ def forward( outputs += (present_key_value,) return outputs + + +# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/bert/modeling_bert.py#L524 +class _IPEXIntermediate(nn.Module): + def __init__(self, module, config): + super().__init__() + _setattr_from_module(self, module) + self.linear_gelu = LinearGelu(module.dense) + del self.__dict__["_modules"]["dense"] + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.linear_gelu(hidden_states) + return hidden_states diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 927c98ac37..757244df55 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import gc
 import logging
 import warnings
 from pathlib import Path
@@ -28,6 +29,8 @@
 from optimum.intel.utils.import_utils import is_openvino_tokenizers_available, is_transformers_version
 from optimum.utils.save_utils import maybe_load_preprocessors
+from .utils import clear_class_registry
+
 if TYPE_CHECKING:
     from optimum.intel.openvino.configuration import OVConfig
@@ -367,6 +370,10 @@ class StoreAttr(object):
     if convert_tokenizer:
         maybe_convert_tokenizers(library_name, output, model, preprocessors)
+    clear_class_registry()
+    del model
+    gc.collect()
+
     # Unpatch modules after GPTQ export
     if do_gptq_patching:
         torch.cuda.is_available = orig_cuda_check
diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py
index d0aabfb2d5..7812682b8b 100644
--- a/optimum/exporters/openvino/model_patcher.py
+++ b/optimum/exporters/openvino/model_patcher.py
@@ -1161,7 +1161,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             block.attention.forward = block.attention._orig_forward
-# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L426
+# Adapted from https://github.com/huggingface/transformers/blob/ccdabc5642bf84849af93f591e207dc625c8e1e1/src/transformers/models/phi3/modeling_phi3.py#L729
 def _phi3_self_attn_sdpa_forward(
     self,
     hidden_states: torch.Tensor,
@@ -1170,6 +1170,7 @@ def _phi3_self_attn_sdpa_forward(
     past_key_value: Optional[Tuple[torch.Tensor]] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     if output_attentions:
         return self._orig_forward(
@@ -1181,10 +1182,9 @@ def _phi3_self_attn_sdpa_forward(
             use_cache=use_cache,
         )
-    # TO DO: remove llama imports when transformers with phi3 support will be released
-    try:
+    if is_transformers_version(">=", "4.41.0"):
         from transformers.models.phi3.modeling_phi3 import apply_rotary_pos_emb, repeat_kv
-    except ImportError:
+    else:
         from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
     bsz, q_len, _ = hidden_states.size()
@@ -1206,17 +1206,15 @@ def _phi3_self_attn_sdpa_forward(
     query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
     if past_key_value is not None:
-        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+        cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
         key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
     key_states = repeat_kv(key_states, self.num_key_value_groups)
     value_states = repeat_kv(value_states, self.num_key_value_groups)
+    causal_mask = attention_mask
     if attention_mask is not None:
-        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-            raise ValueError(
-                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-            )
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
     # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
     # Reference: https://github.com/pytorch/pytorch/issues/112577.
@@ -1229,7 +1227,7 @@ def _phi3_self_attn_sdpa_forward(
         query_states,
         key_states,
         value_states,
-        attn_mask=attention_mask,
+        attn_mask=causal_mask,
         dropout_p=self.attention_dropout if self.training else 0.0,
         # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
         is_causal=self.is_causal and attention_mask is None and q_len > 1,
@@ -1561,7 +1559,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             layer.attn._attn = layer.attn._orig_attn
-# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L763
+# Adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L763
 def _dbrx_experts_forward(
     self, x: torch.Tensor, weights: torch.Tensor, top_weights: torch.Tensor, top_experts: torch.LongTensor
 ):
@@ -1606,7 +1604,7 @@ def _dbrx_experts_forward(
     return out
-# adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L1228
+# Adapted from https://github.com/huggingface/transformers/blob/v4.40.2/src/transformers/models/dbrx/modeling_dbrx.py#L1228
 def _dbrx_update_causal_mask_legacy(
     self, attention_mask: Optional[torch.Tensor], input_tensor: torch.Tensor, cache_position: torch.Tensor
 ) -> Optional[torch.Tensor]:
@@ -1803,6 +1801,7 @@ def __exit__(self, exc_type, exc_value, traceback):
             block.ffn.experts.forward = block.ffn.experts._orig_forward
+# Adapted from https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/models/persimmon/modeling_persimmon.py#L264
 def _persimmon_self_attn_sdpa_forward(
     self,
     hidden_states: torch.Tensor,
@@ -1811,6 +1810,7 @@ def _persimmon_self_attn_sdpa_forward(
     past_key_value: Optional["Cache"] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    cache_position: Optional[torch.LongTensor] = None,
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     from transformers.models.persimmon.modeling_persimmon import apply_rotary_pos_emb
@@ -1865,14 +1865,23 @@ def _persimmon_self_attn_sdpa_forward(
     if past_key_value is not None:
         # Specific to RoPE models with partial rotation
-        cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
+        cache_kwargs = {
+            "sin": sin,
+            "cos": cos,
+            "partial_rotation_size": self.rotary_emb.dim,
+            "cache_position": cache_position,
+        }
         key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+    causal_mask = attention_mask
+    if attention_mask is not None:  # no matter the length, we just slice it
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+
     attn_output = F.scaled_dot_product_attention(
         query_states,
         key_states,
         value_states,
-        attention_mask,
+        causal_mask,
         scale=1 / math.sqrt(self.head_dim),
         dropout_p=self.attention_dropout.p,
     )
diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py
index 7d7e854311..22a4745f0c 100644
--- a/optimum/intel/generation/modeling.py
+++ b/optimum/intel/generation/modeling.py
@@ -90,6 +90,7 @@ class BaseModelForCausalLM(OptimizedModel, GenerationMixin):
     export_feature = "text-generation"
     main_input_name = "input_ids"
     base_model_prefix = "torch_script_model"
+    _supports_cache_class = False
     def __init__(
         self,
diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py
index 9f4c0d1056..3d39e944b3 100644
--- a/optimum/intel/ipex/modeling_base.py
+++ b/optimum/intel/ipex/modeling_base.py
@@ -51,7 +51,11 @@
 from optimum.modeling_base import OptimizedModel
 from optimum.utils import NormalizedConfigManager
-from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _IPEX_MINIMUM_VERSION_FOR_PATCHING, _patch_model
+from ...exporters.ipex.model_patcher import (
+    _IPEX_EXPORTED_GENERATION_TASKS,
+    _IPEX_MINIMUM_VERSION_FOR_PATCHING,
+    _patch_model,
+)
 from ..generation.modeling import prepare_jit_inputs
 from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, recursive_to_device
@@ -60,7 +64,7 @@
 logger = logging.getLogger(__name__)
-_IPEX_SUPPORT_MODEL_TYPES = ("llama",)
+_IPEX_SUPPORT_MODEL_TYPES = ("llama", "bert", "vit")
 _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation")
@@ -70,17 +74,22 @@ def _is_patched_with_ipex(model, task):
     if isinstance(model, torch.jit.ScriptModule):
         for node in model.graph.nodes():
-            # Jit will record the codes position so we can check if the node use ipex exporter.
-            if "torch_ipex::rotary_position_embedding" in node.__str__():
+            # Only patched model enabled fusion linear.
+            if "/fusions/" in node.__str__():
                 return True
         return False
-    else:
+    elif task in _IPEX_EXPORTED_GENERATION_TASKS and model.config.hidden_size < 64:
         # The ipex IAKV op in patched model requires the hidden size at least 64
-        return (
-            model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES
-            and task in _IPEX_EXPORTED_TASK
-            and model.config.hidden_size >= 64
-        )
+        return False
+
+    return model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES
+
+
+def _prepare_inputs_for_ipex_model(model, task, use_cache):
+    if task in _IPEX_EXPORTED_GENERATION_TASKS and _is_patched_with_ipex(model, task):
+        return get_dummy_input(model, return_dict=True)
+    else:
+        return prepare_jit_inputs(model, task, use_cache)
 def ipex_jit_trace(model, task, use_cache):
@@ -90,12 +99,8 @@ def ipex_jit_trace(model, task, use_cache):
     if _is_patched_with_ipex(model, task):
         model = _patch_model(model)
-        # TODO: integerate in prepare_jit_inputs.
-        sample_inputs = get_dummy_input(model, return_dict=True)
-        # Use Tensor Processing Primitives to accelerate linear, see https://arxiv.org/abs/2104.05755.
-        _enable_tpp()
-    else:
-        sample_inputs = prepare_jit_inputs(model, task, use_cache)
+
+    sample_inputs = _prepare_inputs_for_ipex_model(model, task, use_cache)
     model.config.return_dict = False
@@ -104,6 +109,8 @@ def ipex_jit_trace(model, task, use_cache):
     if not use_cache:
         sample_inputs.pop("past_key_values")
+    # Use Tensor Processing Primitives to accelerate linear, see https://arxiv.org/abs/2104.05755.
+    _enable_tpp()
     model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True)
     # Disable repack while jit tracing to reduce the memory
     ipex._C.disable_jit_linear_repack()
@@ -230,6 +237,7 @@ def _from_pretrained(
         model = TasksManager.get_model_from_task(
             task,
             model_id,
+            library_name="transformers",
             trust_remote_code=trust_remote_code,
             torch_dtype=torch_dtype,
             _commit_hash=commit_hash,
@@ -265,6 +273,7 @@ def forward(
         input_ids: torch.Tensor,
         attention_mask: torch.Tensor,
         token_type_ids: torch.Tensor = None,
+        position_ids: torch.Tensor = None,
         **kwargs,
     ):
         inputs = {
@@ -275,6 +284,9 @@ def forward(
         if "token_type_ids" in self.input_names:
             inputs["token_type_ids"] = token_type_ids
+        if "position_ids" in self.input_names:
+            inputs["position_ids"] = position_ids
+
         outputs = self._call_model(**inputs)
         if isinstance(outputs, dict):
             model_output = ModelOutput(**outputs)
@@ -415,6 +427,8 @@ def forward(
 class IPEXModelForCausalLM(IPEXModel, GenerationMixin):
     auto_model_class = AutoModelForCausalLM
     export_feature = "text-generation"
+    _supports_cache_class = False
+    _is_stateful = False
     def __init__(
         self,
@@ -463,8 +477,8 @@ def __init__(
         else:
             self._reorder_cache = self.model_cls._reorder_cache.__get__(self)
-        if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon"}:
-            self.prepare_inputs_for_generation = _prepare_inputs_for_generation_for_llama
+        if is_transformers_version(">=", "4.38.0") and model_type in {"llama", "phi", "persimmon", "mistral"}:
+            self.prepare_inputs_for_generation = _ipex_prepare_inputs_for_generation
         else:
             self.prepare_inputs_for_generation = self.model_cls.prepare_inputs_for_generation.__get__(self)
@@ -600,7 +614,7 @@ def generate(self, *args, **kwargs):
         return super().generate(*args, **kwargs)
-def _prepare_inputs_for_generation_for_llama(
+def _ipex_prepare_inputs_for_generation(
     input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
 ):
     from transformers.cache_utils import Cache
diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py
index bb3d2fe8c8..a12cfc84e5 100644
--- a/optimum/intel/neural_compressor/modeling_base.py
+++ b/optimum/intel/neural_compressor/modeling_base.py
@@ -71,6 +71,7 @@ class INCModel(OptimizedModel):
     auto_model_class = AutoModel
     export_feature = "feature-extraction"
     base_model_prefix = "inc_model"
+    _supports_cache_class = False
     def __init__(
         self,
diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py
index f16031a22a..5e64ca862c 100644
--- a/optimum/intel/neural_compressor/trainer.py
+++ b/optimum/intel/neural_compressor/trainer.py
@@ -33,7 +33,6 @@
 import torch.distributed as dist
 from neural_compressor import training
 from neural_compressor.compression import DistillationCallbacks
-from neural_compressor.conf.pythonic_config import _BaseQuantizationConfig
 from packaging import version
 from torch import nn
 from torch.utils.data import Dataset, RandomSampler
@@ -99,6 +98,12 @@
     import torch_xla.core.xla_model as xm
+if is_neural_compressor_version("<", "2.6"):
+    from neural_compressor.conf.pythonic_config import _BaseQuantizationConfig
+else:
+    from neural_compressor.config import _BaseQuantizationConfig
+
+
 __version__ = "4.22.2"
@@ -659,7 +664,7 @@ def _inner_training_loop(
         return TrainOutput(self.state.global_step, train_loss, metrics)
-    def save_model(self, output_dir: Optional[str] = None):
+    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
         """
         Will save the model, so you can reload it using `from_pretrained()`.
         Will only save from the main process.
@@ -670,6 +675,8 @@ def save_model(self, output_dir: Optional[str] = None):
         if self.args.should_save:
             self._save(output_dir=output_dir)
+        # TODO: push to hub if self.args.push_to_hub and not _internal_call
+
     def _save(self, output_dir=None, state_dict=None):
         # If we are executing this function, we are the process zero, so we don't check for that.
         output_dir = output_dir if output_dir is not None else self.args.output_dir
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index 7a6860a4df..5abb518183 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -70,6 +70,7 @@
     "THUDM/chatglm2-6b": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.72},
     "Qwen/Qwen-7B-Chat": {"bits": 4, "sym": True, "group_size": 128, "ratio": 0.6},
     "openlm-research/open_llama_3b": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
+    "openlm-research/open_llama_3b_v2": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
     "tiiuae/falcon-7b-instruct": {"bits": 4, "sym": True, "group_size": 64, "all_layers": True},
     "psmathur/orca_mini_3b": {
         "bits": 4,
@@ -103,6 +104,14 @@
     },
 }
+_DEFAULT_4BIT_CONFIG = {
+    "bits": 4,
+    "ratio": 1.0,
+    "sym": False,
+    "group_size": 128,
+    "all_layers": None,
+}
+
 class OVQuantizationMethod(str, Enum):
     DEFAULT = "default"
diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py
index c33fcdd44e..9fbc73e856 100644
--- a/optimum/intel/openvino/modeling_base.py
+++ b/optimum/intel/openvino/modeling_base.py
@@ -50,6 +50,7 @@ class OVBaseModel(OptimizedModel):
     auto_model_class = None
     export_feature = None
+    _supports_cache_class = False
     def __init__(
         self,
diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py
index 067b3e5d5d..4f8b26d934 100644
--- a/optimum/intel/openvino/modeling_decoder.py
+++ b/optimum/intel/openvino/modeling_decoder.py
@@ -40,7 +40,13 @@
 from ...exporters.openvino.stateful import model_has_state
 from ..utils.import_utils import is_nncf_available, is_transformers_version
 from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS
-from .configuration import _DEFAULT_4BIT_CONFIGS, OVConfig, OVWeightQuantizationConfig, _check_default_4bit_configs
+from .configuration import (
+    _DEFAULT_4BIT_CONFIG,
+    _DEFAULT_4BIT_CONFIGS,
+    OVConfig,
+    OVWeightQuantizationConfig,
+    _check_default_4bit_configs,
+)
 from .modeling import _TOKENIZER_FOR_DOC, INPUTS_DOCSTRING, MODEL_START_DOCSTRING, OVModel
 from .utils import ONNX_WEIGHTS_NAME, OV_TO_NP_TYPE, OV_XML_FILE_NAME, STR_TO_OV_TYPE
@@ -775,7 +781,7 @@ def _from_pretrained(
         init_cls = cls
         if isinstance(quantization_config, dict) and quantization_config == {"bits": 4}:
-            quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, quantization_config)
+            quantization_config = _DEFAULT_4BIT_CONFIGS.get(config.name_or_path, _DEFAULT_4BIT_CONFIG)
         quantization_config = cls._prepare_weight_quantization_config(quantization_config, load_in_8bit)
         enable_compilation = kwargs.pop("compile", True) and not quantization_config
diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py
index 1b880e736c..5013db482e 100644
--- a/optimum/intel/openvino/modeling_diffusion.py
+++ b/optimum/intel/openvino/modeling_diffusion.py
@@ -33,6 +33,7 @@
     StableDiffusionXLImg2ImgPipeline,
     StableDiffusionXLPipeline,
 )
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available
 from huggingface_hub import snapshot_download
@@ -90,6 +91,7 @@ def __init__(
         tokenizer: Optional["CLIPTokenizer"] = None,
         tokenizer_2: Optional["CLIPTokenizer"] = None,
         feature_extractor: Optional["CLIPFeatureExtractor"] = None,
+        safety_checker: Optional["StableDiffusionSafetyChecker"] = None,
         device: str = "CPU",
         dynamic_shapes: bool = True,
         compile: bool = True,
@@ -135,7 +137,7 @@ def __init__(
         self.tokenizer_2 = tokenizer_2
         self.scheduler = scheduler
         self.feature_extractor = feature_extractor
-        self.safety_checker = None
+        self.safety_checker = safety_checker
         self.preprocessors = []
         if self.is_dynamic:
@@ -1082,6 +1084,22 @@ def __call__(
             **kwargs,
         )
+    def run_safety_checker(self, image: np.ndarray):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            # Transpose the image to NHWC
+            image = image.transpose(0, 2, 3, 1)
+
+            feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt")
+            image, has_nsfw_concept = self.safety_checker(images=image, clip_input=safety_checker_input.pixel_values)
+
+            # Transpose the image back to NCHW
+            image = image.transpose(0, 3, 1, 2)
+
+        return image, has_nsfw_concept
+
 def _raise_invalid_batch_size(
     expected_batch_size: int, batch_size: int, num_images_per_prompt: int, guidance_scale: float
diff --git a/setup.py b/setup.py
index c80dd1cf41..110df03015 100644
--- a/setup.py
+++ b/setup.py
@@ -28,8 +28,8 @@
 INSTALL_REQUIRE = [
     "torch>=1.11",
-    "transformers>=4.36.0,<4.42.0",
-    "optimum~=1.20",
+    "transformers>=4.36.0,<4.43.0",
+    "optimum>=1.21.2,<1.22.0",
     "datasets>=1.4.0",
     "sentencepiece",
     "setuptools",
@@ -56,13 +56,13 @@
     "sentence-transformers",
 ]
-QUALITY_REQUIRE = ["black~=23.1", "ruff>=0.0.241"]
+QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]
 EXTRAS_REQUIRE = {
     "neural-compressor": ["neural-compressor>=2.2.0", "accelerate"],
     "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
     "nncf": ["nncf>=2.11.0"],
-    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"],
+    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.42.3"],
     "diffusers": ["diffusers"],
     "quality": QUALITY_REQUIRE,
     "tests": TESTS_REQUIRE,
diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py
index b6635ca154..98c3150f04 100644
--- a/tests/ipex/test_modeling.py
+++ b/tests/ipex/test_modeling.py
@@ -188,6 +188,21 @@ def test_pipeline(self, model_arch):
         self.assertGreaterEqual(outputs["score"], 0.0)
         self.assertIsInstance(outputs["answer"], str)
+    @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching")
+    def test_patched_model(self):
+        ipex_model = IPEXModelForQuestionAnswering.from_pretrained(
+            "Jiqing/patched_tiny_random_bert_for_question_answering"
+        )
+        transformers_model = AutoModelForQuestionAnswering.from_pretrained("hf-internal-testing/tiny-random-bert")
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert")
+        inputs = "This is a sample input"
+        tokens = tokenizer(inputs, return_tensors="pt")
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**tokens)
+        outputs = ipex_model(**tokens)
+        self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4))
+        self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4))
+
 class IPEXModelForCausalLMTest(unittest.TestCase):
     IPEX_MODEL_CLASS = IPEXModelForCausalLM
@@ -458,3 +473,18 @@ def test_pipeline(self, model_arch):
         self.assertEqual(pipe.device, model.device)
         self.assertGreaterEqual(outputs[0]["score"], 0.0)
         self.assertTrue(isinstance(outputs[0]["label"], str))
+
+    @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching")
+    def test_patched_model(self):
+        ipex_model = IPEXModelForImageClassification.from_pretrained(
+            "Jiqing/patched_tiny_random_vit_for_image_classification"
+        )
+        transformers_model = self.IPEX_MODEL_CLASS.from_pretrained("hf-internal-testing/tiny-random-vit")
+        preprocessor = AutoFeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-vit")
+        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+        inputs = preprocessor(images=image, return_tensors="pt")
+        with torch.no_grad():
+            transformers_outputs = transformers_model(**inputs)
+        outputs = ipex_model(**inputs)
+        self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4))
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 2df03f6b34..b1b6643186 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -86,10 +86,10 @@ class OVCLIExportTestCase(unittest.TestCase):
     )
     TEST_4BIT_CONFIGURATONS = [
-        ("text-generation-with-past", "opt125m", "int4_sym_g128", 62, 86),
-        ("text-generation-with-past", "opt125m", "int4_asym_g128", 62, 86),
-        ("text-generation-with-past", "opt125m", "int4_sym_g64", 62, 86),
-        ("text-generation-with-past", "opt125m", "int4_asym_g64", 62, 86),
+        ("text-generation-with-past", "opt125m", "int4_sym_g128", 4, 144),
+        ("text-generation-with-past", "opt125m", "int4_asym_g128", 4, 144),
+        ("text-generation-with-past", "opt125m", "int4_sym_g64", 4, 144),
+        ("text-generation-with-past", "opt125m", "int4_asym_g64", 4, 144),
         ("text-generation-with-past", "llama_awq", "int4 --ratio 1.0 --sym --group-size 16 --all-layers", 0, 32),
         (
             "text-generation-with-past",
diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py
index c7a381a0e2..bdc1e9afee 100644
--- a/tests/openvino/test_modeling.py
+++ b/tests/openvino/test_modeling.py
@@ -697,7 +697,6 @@ def test_compare_to_transformers(self, model_arch):
         ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True, ov_config=F32_CONFIG, **model_kwargs)
         self.assertIsInstance(ov_model.config, PretrainedConfig)
         self.assertTrue(ov_model.use_cache)
-        self.assertEqual(ov_model.stateful, ov_model.config.model_type not in not_stateful)
         tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS)
         tokens = tokenizer("This is a sample output", return_tensors="pt")
         tokens.pop("token_type_ids", None)
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 67970fbbcb..1f71c9a9ed 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -75,8 +75,8 @@ class OVQuantizerTest(unittest.TestCase):
     SUPPORTED_ARCHITECTURES_TORCH_MODEL = (
-        (OVModelForSequenceClassification, "bert", 22, 35),
-        (OVModelForCausalLM, "gpt2", 41, 3),
+        (OVModelForSequenceClassification, "bert", 32 if is_transformers_version("<", "4.41.0") else 22, 35),
+        (OVModelForCausalLM, "gpt2", 41 if is_transformers_version("<", "4.42.0") else 21, 3),
     )
     SUPPORTED_ARCHITECTURES_OV_MODEL = (
         (OVModelForSequenceClassification, "bert", 32, 35),
@@ -90,9 +90,6 @@ def test_automodel_static_quantization(self, model_cls, model_name, expected_fak
         dataset_name, dataset_config_name, column_name = _TASK_TO_DATASET[task]
         file_name = "openvino_quantized_model.xml"
-        if model_name == "bert" and is_transformers_version("<", "4.41.0"):
-            expected_fake_quantize = 32
-
         def preprocess_function(examples, tokenizer):
             return tokenizer(examples[column_name], padding="max_length", max_length=128, truncation=True)
diff --git a/tests/openvino/test_stable_diffusion.py b/tests/openvino/test_stable_diffusion.py
index e735a07fb4..d8e8e76030 100644
--- a/tests/openvino/test_stable_diffusion.py
+++ b/tests/openvino/test_stable_diffusion.py
@@ -555,3 +555,51 @@ def test_num_images_per_prompt_static_model(self, model_arch: str):
         inputs = _generate_inputs(batch_size)
         outputs = pipeline(**inputs, num_images_per_prompt=num_images, height=_height, width=width).images
         self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3))
+
+    @parameterized.expand(SUPPORTED_ARCHITECTURES)
+    @unittest.skipIf(is_diffusers_version("<=", "0.21.4"), "not supported with this diffusers version")
+    def test_safety_checker(self, model_arch: str):
+        ov_pipeline = self.MODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True, ov_config=F32_CONFIG)
+        self.assertIsInstance(ov_pipeline.text_encoder, OVModelTextEncoder)
+        self.assertIsInstance(ov_pipeline.vae_encoder, OVModelVaeEncoder)
+        self.assertIsInstance(ov_pipeline.vae_decoder, OVModelVaeDecoder)
+        self.assertIsInstance(ov_pipeline.unet, OVModelUnet)
+        self.assertIsInstance(ov_pipeline.config, Dict)
+
+        from diffusers import LatentConsistencyModelPipeline
+        from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
+
+        safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker")
+        pipeline = LatentConsistencyModelPipeline.from_pretrained(
+            MODEL_NAMES[model_arch], safety_checker=safety_checker
+        )
+
+        batch_size, num_images_per_prompt, height, width = 2, 3, 64, 128
+        latents = ov_pipeline.prepare_latents(
+            batch_size * num_images_per_prompt,
+            ov_pipeline.unet.config["in_channels"],
+            height,
+            width,
+            dtype=np.float32,
+            generator=np.random.RandomState(0),
+        )
+
+        kwargs = {
+            "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size,
+            "num_inference_steps": 1,
+            "num_images_per_prompt": num_images_per_prompt,
+            "height": height,
+            "width": width,
+            "guidance_scale": 8.5,
+        }
+
+        for output_type in ["latent", "np"]:
+            ov_outputs = ov_pipeline(latents=latents, output_type=output_type, **kwargs).images
+            self.assertIsInstance(ov_outputs, np.ndarray)
+            with torch.no_grad():
+                outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images
+
+            # Compare model outputs
+            self.assertTrue(np.allclose(ov_outputs, outputs, atol=1e-4))
+            # Compare model devices
+            self.assertEqual(pipeline.device.type, ov_pipeline.device)
diff --git a/tests/openvino/test_training.py b/tests/openvino/test_training.py
index 375fc6e4a1..9e85274454 100644
--- a/tests/openvino/test_training.py
+++ b/tests/openvino/test_training.py
@@ -45,6 +45,7 @@
 from transformers.testing_utils import slow
 from transformers.trainer_utils import EvalPrediction, TrainOutput
 from transformers.utils import WEIGHTS_NAME
+from utils_tests import MODEL_NAMES
 from optimum.intel.openvino import OVTrainingArguments
 from optimum.intel.openvino.configuration import OVConfig
@@ -317,56 +318,56 @@ def tearDown(self):
 # TODO: Uncomment failes tests after NNCF 2.8.1 patch release
 OVTRAINER_TEXT_CLASSIFICATION_TEST_DESCRIPTORS = {
     "distillation": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[],
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "default_quantization": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG,
         expected_fake_quantize=22,
         expected_int8=32,
         compression_metrics=["compression_loss"],
     ),
     "distillation,default_quantization": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG,
         expected_fake_quantize=22,
         expected_int8=32,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "customized_quantization": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG,
         expected_fake_quantize=22,
         expected_int8=32,
         compression_metrics=["compression_loss"],
     ),
     "distillation,customized_quantization": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=CUSTOMIZED_QUANTIZATION_CONFIG,
         expected_fake_quantize=22,
         expected_int8=32,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         expected_binary_masks=60,
         compression_metrics=["compression_loss"],
     ),
     "distillation,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         expected_binary_masks=60,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT],
         expected_fake_quantize=22,
         expected_int8=32,
@@ -374,7 +375,7 @@ def tearDown(self):
         compression_metrics=["compression_loss"],
     ),
     "customized_quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[
             CUSTOMIZED_QUANTIZATION_CONFIG,
             STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         ],
@@ -385,8 +386,8 @@ def tearDown(self):
         compression_metrics=["compression_loss"],
     ),
     "distillation,default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT],
         expected_fake_quantize=22,
         expected_int8=32,
@@ -394,8 +395,8 @@ def tearDown(self):
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "distillation,customized_quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[
             CUSTOMIZED_QUANTIZATION_CONFIG,
             STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         ],
@@ -406,20 +407,20 @@ def tearDown(self):
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         expected_binary_masks=60,
         compression_metrics=["compression_loss"],
     ),
     "distillation,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         expected_binary_masks=60,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT],
         expected_fake_quantize=22,
         expected_int8=32,
@@ -427,7 +428,7 @@ def tearDown(self):
         compression_metrics=["compression_loss"],
     ),
     "customized_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[
             CUSTOMIZED_QUANTIZATION_CONFIG,
             UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         ],
@@ -438,8 +439,8 @@ def tearDown(self):
         compression_metrics=["compression_loss"],
     ),
     "distillation,default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[DEFAULT_QUANTIZATION_CONFIG, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT],
         expected_fake_quantize=22,
         expected_int8=32,
@@ -447,8 +448,8 @@ def tearDown(self):
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "distillation,customized_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-bert",
-        teacher_model_id="hf-internal-testing/tiny-random-bert",
+        model_id=MODEL_NAMES["bert"],
+        teacher_model_id=MODEL_NAMES["bert"],
         nncf_compression_config=[
             CUSTOMIZED_QUANTIZATION_CONFIG,
             UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_BERT,
         ],
@@ -552,62 +553,62 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
 }
 UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN = deepcopy(STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN)
 UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN["params"]["enable_structured_masking"] = False
-
 OVTRAINER_IMAGE_CLASSIFICATION_TEST_DESCRIPTORS = {
     "default_quantization": OVTrainerTestDescriptor(
-        model_id="yujiepan/tiny-random-swin-patch4-window7-224",
+        model_id=MODEL_NAMES["swin"],
         nncf_compression_config=DEFAULT_QUANTIZATION_CONFIG,
-        expected_fake_quantize=36,
-        expected_int8=28,
+        expected_fake_quantize=35,
+        expected_int8=27,
         compression_metrics=["compression_loss"],
     ),
     "structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="yujiepan/tiny-random-swin-patch4-window7-224",
+        model_id=MODEL_NAMES["swin"],
         nncf_compression_config=STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN,
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
     ),
     "unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="yujiepan/tiny-random-swin-patch4-window7-224",
+        model_id=MODEL_NAMES["swin"],
         nncf_compression_config=UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN,
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
     ),
     "default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="yujiepan/tiny-random-swin-patch4-window7-224",
+        model_id=MODEL_NAMES["swin"],
         nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG],
-        expected_fake_quantize=36,
-        expected_int8=28,
+        expected_fake_quantize=35,
+        expected_int8=27,
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
     ),
     "default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="yujiepan/tiny-random-swin-patch4-window7-224",
+        model_id=MODEL_NAMES["swin"],
         nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG],
-        expected_fake_quantize=36,
-        expected_int8=28,
+        expected_fake_quantize=35,
+        expected_int8=27,
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
     ),
     "distillation,default_quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="yujiepan/tiny-random-swin-patch4-window7-224",
-        teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224",
+        model_id=MODEL_NAMES["swin"],
+        teacher_model_id=MODEL_NAMES["swin"],
         nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG],
-        expected_fake_quantize=36,
-        expected_int8=28,
+        expected_fake_quantize=35,
+        expected_int8=27,
         expected_binary_masks=48,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "distillation,default_quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="yujiepan/tiny-random-swin-patch4-window7-224",
-        teacher_model_id="yujiepan/tiny-random-swin-patch4-window7-224",
+        model_id=MODEL_NAMES["swin"],
+        teacher_model_id=MODEL_NAMES["swin"],
         nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_SWIN, DEFAULT_QUANTIZATION_CONFIG],
-        expected_fake_quantize=36,
-        expected_int8=28,
+        expected_fake_quantize=35,
+        expected_int8=27,
         expected_binary_masks=48,
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
 }
+# TODO : can be moved to MODEL_NAMES["swin-window"] after transformers v4.42.3
 class OVTrainerImageClassificationTrainingTest(OVTrainerBaseTrainingTest):
@@ -735,26 +736,26 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
 OVTRAINER_AUDIO_CLASSIFICATION_TEST_DESCRIPTORS = {
     "quantization": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
+        model_id=MODEL_NAMES["wav2vec2-hf"],
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2],
         expected_fake_quantize=40,
         expected_int8=30,
         compression_metrics=["compression_loss"],
     ),
     "structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
+        model_id=MODEL_NAMES["wav2vec2-hf"],
         nncf_compression_config=[STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
     ),
     "unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
+        model_id=MODEL_NAMES["wav2vec2-hf"],
        nncf_compression_config=[UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
         expected_binary_masks=48,
         compression_metrics=["compression_loss"],
     ),
     "quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
+        model_id=MODEL_NAMES["wav2vec2-hf"],
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
         expected_fake_quantize=40,
         expected_int8=30,
@@ -762,7 +763,7 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
         compression_metrics=["compression_loss"],
     ),
     "quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
+        model_id=MODEL_NAMES["wav2vec2-hf"],
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
         expected_fake_quantize=40,
         expected_int8=30,
@@ -770,8 +771,8 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
         compression_metrics=["compression_loss"],
     ),
     "distillation,quantization,structured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
-        teacher_model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
+        model_id=MODEL_NAMES["wav2vec2-hf"],
+        teacher_model_id=MODEL_NAMES["wav2vec2-hf"],
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, STRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
         expected_fake_quantize=40,
         expected_int8=30,
@@ -779,8 +780,8 @@ def check_ovmodel_reshaping(self, ovmodel: OVModel):
         compression_metrics=["compression_loss", "distillation_loss", "task_loss"],
     ),
     "distillation,quantization,unstructured_movement_sparsity": OVTrainerTestDescriptor(
-        model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
-        teacher_model_id="hf-internal-testing/tiny-random-Wav2Vec2Model",
+        model_id=MODEL_NAMES["wav2vec2-hf"],
+        teacher_model_id=MODEL_NAMES["wav2vec2-hf"],
         nncf_compression_config=[QUANTIZATION_CONFIG_FOR_WAV2VEC2, UNSTRUCTURED_MOVEMENT_SPARSITY_CONFIG_FOR_WAV2VEC2],
         expected_fake_quantize=40,
         expected_int8=30,
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index 590beefb36..1f9b051d7d 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -118,6 +118,7 @@
     "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h",
     "arctic": "katuni4ka/tiny-random-snowflake",
     "swin": "hf-internal-testing/tiny-random-SwinModel",
+    "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224",
     "t5": "hf-internal-testing/tiny-random-t5",
     "trocr": "microsoft/trocr-small-handwritten",
     "unispeech": "hf-internal-testing/tiny-random-unispeech",