From 87da8b0eb2e99f7e80df2079e5956b19212b7c1b Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 24 Oct 2023 15:36:09 +0200 Subject: [PATCH 1/9] Add timm version requirement (#461) --- optimum/intel/openvino/modeling.py | 7 ++++++- optimum/intel/utils/import_utils.py | 9 +++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling.py b/optimum/intel/openvino/modeling.py index a0c753d94d..8cea5eb7b6 100644 --- a/optimum/intel/openvino/modeling.py +++ b/optimum/intel/openvino/modeling.py @@ -49,7 +49,7 @@ from optimum.exporters import TasksManager -from ..utils.import_utils import is_timm_available +from ..utils.import_utils import is_timm_available, is_timm_version from .modeling_base import OVBaseModel from .utils import _is_timm_ov_dir @@ -540,6 +540,11 @@ def from_pretrained( "To load a timm model, timm needs to be installed. Please install it with `pip install timm`." ) + if is_timm_version("<", "0.9.0"): + raise ImportError( + "To load a timm model, please make sure to upgrade your `timm` version to at least 0.9.0, you can upgrade it by running `pip install --upgrade timm`" + ) + from .modeling_timm import TimmConfig, TimmForImageClassification, TimmOnnxConfig config = TimmConfig.from_pretrained(model_id, **kwargs) diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index d781f1f513..87ddcc1315 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -220,6 +220,15 @@ def is_ipex_version(operation: str, version: str): return compare_versions(parse(_ipex_version), operation, version) +def is_timm_version(operation: str, version: str): + """ + Compare the current timm version to a given reference with an operation. + """ + if not _timm_available: + return False + return compare_versions(parse(_timm_version), operation, version) + + DIFFUSERS_IMPORT_ERROR = """ {0} requires the diffusers library but it was not found in your environment. You can install it with pip: `pip install diffusers`. Please note that you may need to restart your runtime after installation. 
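A minimal usage sketch of the guard introduced by this patch, assuming an optimum-intel install that contains the `is_timm_available`/`is_timm_version` helpers from `optimum/intel/utils/import_utils.py`; the standalone `_check_timm_requirements` wrapper below is illustrative only and is not part of the diff:

# Illustrative only: reproduces, as a standalone helper, the timm checks that
# this patch adds to OVModelForImageClassification.from_pretrained.
from optimum.intel.utils.import_utils import is_timm_available, is_timm_version

def _check_timm_requirements():
    # Mirrors the guard added in optimum/intel/openvino/modeling.py
    if not is_timm_available():
        raise ImportError(
            "To load a timm model, timm needs to be installed. Please install it with `pip install timm`."
        )
    # is_timm_version returns False when timm is missing, so it is checked
    # only after the availability test above.
    if is_timm_version("<", "0.9.0"):
        raise ImportError(
            "To load a timm model, please upgrade `timm` to at least 0.9.0 with `pip install --upgrade timm`."
        )
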
From 532051246fb76dac2e312510412446dcd8888b26 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Tue, 31 Oct 2023 17:57:43 +0100 Subject: [PATCH 2/9] Do not automatically cache models in temp dirs (#462) * Do not automatically cache models in temp dirs * Fix for Python 3.8 Path.is_relative_to() was added in Python 3.9 * Disable speedup test for CausalLM with pkv Speedup is small on the Github Actions runner hardware so this test regularly fails even with a speedup threshold of only 1.1 * Copy ov_config for seq2seq models --- optimum/intel/openvino/modeling_base.py | 8 ++-- optimum/intel/openvino/modeling_diffusion.py | 8 ++-- optimum/intel/openvino/modeling_seq2seq.py | 38 +++++++++-------- tests/openvino/test_modeling.py | 45 +++++++++++--------- 4 files changed, 52 insertions(+), 47 deletions(-) diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 9384477eb9..eb1ed88467 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -15,7 +15,7 @@ import logging import os from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Dict, Optional, Union import openvino @@ -339,11 +339,11 @@ def compile(self): if self.request is None: logger.info(f"Compiling the model to {self._device} ...") ov_config = {**self.ov_config} - if "CACHE_DIR" not in self.ov_config.keys(): - # Set default CACHE_DIR only if it is not set. + if "CACHE_DIR" not in self.ov_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + # Set default CACHE_DIR only if it is not set, and if the model is not in a temporary directory cache_dir = Path(self.model_save_dir).joinpath("model_cache") ov_config["CACHE_DIR"] = str(cache_dir) - logger.info(f"Set CACHE_DIR to {str(cache_dir)}") + logger.info(f"Setting OpenVINO CACHE_DIR to {str(cache_dir)}") self.request = core.compile_model(self.model, self._device, ov_config) def _reshape( diff --git a/optimum/intel/openvino/modeling_diffusion.py b/optimum/intel/openvino/modeling_diffusion.py index 1ca0b93643..2707260606 100644 --- a/optimum/intel/openvino/modeling_diffusion.py +++ b/optimum/intel/openvino/modeling_diffusion.py @@ -17,7 +17,7 @@ import os import shutil from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import TemporaryDirectory, gettempdir from typing import Any, Dict, List, Optional, Union import numpy as np @@ -539,10 +539,8 @@ def __init__( self._model_dir = Path(model_dir or parent_model._model_save_dir) config_path = self._model_dir / model_name / self.CONFIG_NAME self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - - # TODO : disable if self._model_dir tmp directory - if "CACHE_DIR" not in self.ov_config: - self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name) + if "CACHE_DIR" not in self.ov_config.keys() and not str(self._model_dir).startswith(gettempdir()): + self.ov_config["CACHE_DIR"] = os.path.join(self._model_dir, self._model_name, "model_cache") def _compile(self): if self.request is None: diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py index 87cd18d875..6b759054d0 100644 --- a/optimum/intel/openvino/modeling_seq2seq.py +++ b/optimum/intel/openvino/modeling_seq2seq.py @@ -14,6 +14,7 @@ import logging from pathlib import Path +from tempfile import gettempdir from typing import Dict, Optional, Tuple import numpy as np @@ 
-202,31 +203,32 @@ def __init__( self.decoder_with_past = None enable_compilation = kwargs.get("compile", True) encoder_cache_dir = Path(self.model_save_dir).joinpath("encoder_cache") - encoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_encoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(encoder_cache_dir)} - ) + ov_encoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_encoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_encoder_config["CACHE_DIR"] = str(encoder_cache_dir) + self.encoder = OVEncoder( self.encoder_model, self._device, ov_encoder_config, main_input_name=self.main_input_name ) + decoder_cache_dir = Path(self.model_save_dir).joinpath("decoder_cache") - decoder_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_cache_dir)} - ) + ov_decoder_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_config.keys() and not str(self.model_save_dir).startswith(gettempdir()): + ov_decoder_config["CACHE_DIR"] = str(decoder_cache_dir) + self.decoder = OVDecoder(self.decoder_model, self._device, ov_decoder_config) + if self.use_cache: decoder_past_cache_dir = Path(self.model_save_dir).joinpath("decoder_past_cache") - decoder_past_cache_dir.mkdir(parents=True, exist_ok=True) - ov_decoder_past_config = ( - {**self.ov_config} - if "CACHE_DIR" in self.ov_config.keys() - else {**self.ov_config, "CACHE_DIR": str(decoder_past_cache_dir)} - ) + ov_decoder_past_config = {**self.ov_config} + + if "CACHE_DIR" not in ov_decoder_past_config.keys() and not str(self.model_save_dir).startswith( + gettempdir() + ): + ov_decoder_past_config["CACHE_DIR"] = str(decoder_past_cache_dir) + self.decoder_with_past = OVDecoder(self.decoder_with_past_model, self._device, ov_decoder_past_config) if enable_compilation: self.compile() diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index bf1a007844..f3978b2965 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -111,6 +111,19 @@ def test_load_from_hub_and_save_model(self): self.assertIsInstance(loaded_model.config, PretrainedConfig) loaded_model_outputs = loaded_model(**tokens) + # Test that model caching is automatically enabled + openvino_cache_dir = loaded_model.model_save_dir / "model_cache" + self.assertTrue(openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(openvino_cache_dir.glob("*.blob"))), 1) + + # Test specifying ov_config with throughput hint and manual cache dir + manual_openvino_cache_dir = loaded_model.model_save_dir / "manual_model_cache" + ov_config = {"CACHE_DIR": str(manual_openvino_cache_dir), "PERFORMANCE_HINT": "THROUGHPUT"} + loaded_model = OVModelForSequenceClassification.from_pretrained(self.OV_MODEL_ID, ov_config=ov_config) + self.assertTrue(manual_openvino_cache_dir.is_dir()) + self.assertGreaterEqual(len(list(manual_openvino_cache_dir.glob("*.blob"))), 1) + self.assertEqual(loaded_model.request.get_property("PERFORMANCE_HINT").name, "THROUGHPUT") + with tempfile.TemporaryDirectory() as tmpdirname: loaded_model.save_pretrained(tmpdirname) folder_contents = os.listdir(tmpdirname) @@ -120,6 +133,7 @@ def test_load_from_hub_and_save_model(self): outputs = model(**tokens) self.assertTrue(torch.equal(loaded_model_outputs.logits, outputs.logits)) + del loaded_model del model gc.collect() @@ -276,6 +290,10 @@ def 
test_pipeline(self, model_arch): self.assertTrue(not model.is_dynamic) self.assertGreaterEqual(outputs[0]["score"], 0.0) self.assertIsInstance(outputs[0]["label"], str) + # Test that model caching was not automatically enabled for exported model + openvino_cache_dir = model.model_save_dir / "model_cache" + self.assertFalse(openvino_cache_dir.is_dir()) + del model del pipe gc.collect() @@ -466,7 +484,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "pegasus", ) GENERATION_LENGTH = 100 - SPEEDUP_CACHE = 1.1 @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): @@ -539,29 +556,17 @@ def test_compare_with_and_without_past_key_values(self): tokens = tokenizer("This is a sample input", return_tensors="pt") model_with_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) - # Warmup - _ = model_with_pkv.generate(**tokens) - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) model_without_pkv = OVModelForCausalLM.from_pretrained(model_id, export=True, use_cache=False) - - # Warmup - _ = model_without_pkv.generate(**tokens) - with Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) + del model_with_pkv del model_without_pkv gc.collect() From 6c179be44a758f99e808c8038bed7a7b05d52cd0 Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Thu, 2 Nov 2023 11:30:45 +0100 Subject: [PATCH 3/9] Allow openvino-nightly (#466) --- .github/workflows/test_openvino.yml | 6 ++++++ optimum/intel/utils/import_utils.py | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test_openvino.yml b/.github/workflows/test_openvino.yml index cb58f412a6..d43cabe323 100644 --- a/.github/workflows/test_openvino.yml +++ b/.github/workflows/test_openvino.yml @@ -36,3 +36,9 @@ jobs: - name: Test with Pytest run: | pytest tests/openvino/ --ignore test_modeling_basic + - name: Test openvino-nightly import + run: | + pip uninstall -y openvino + pip install openvino-nightly + python -c "from optimum.intel import OVModelForCausalLM; OVModelForCausalLM.from_pretrained('hf-internal-testing/tiny-random-gpt2', export=True, compile=False)" + diff --git a/optimum/intel/utils/import_utils.py b/optimum/intel/utils/import_utils.py index 87ddcc1315..d15780384f 100644 --- a/optimum/intel/utils/import_utils.py +++ b/optimum/intel/utils/import_utils.py @@ -71,7 +71,10 @@ try: _openvino_version = importlib_metadata.version("openvino") except 
importlib_metadata.PackageNotFoundError: - _openvino_available = False + try: + _openvino_version = importlib_metadata.version("openvino-nightly") + except importlib_metadata.PackageNotFoundError: + _openvino_available = False _nncf_available = importlib.util.find_spec("nncf") is not None From ae36dda88f60e4f24836059793c9e9914fe31fed Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 2 Nov 2023 14:33:36 +0400 Subject: [PATCH 4/9] move gptq patching to export stage (#465) --- optimum/exporters/openvino/__main__.py | 42 +++++++++++++++++++++- optimum/intel/openvino/modeling_decoder.py | 35 +----------------- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 782aa0bc0d..b17d93aa5e 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -18,7 +18,7 @@ from typing import Any, Callable, Dict, Optional, Union from requests.exceptions import ConnectionError as RequestsConnectionError -from transformers import AutoTokenizer +from transformers import AutoConfig, AutoTokenizer from optimum.exporters import TasksManager from optimum.exporters.onnx import __main__ as optimum_main @@ -137,6 +137,41 @@ def main_export( original_task = task task = TasksManager.map_from_synonym(task) + # Patch the modules to export of GPTQ models w/o GPU + do_gptq_patching = False + try: + config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + config_dict = config.to_dict() + quantization_config = config_dict.get("quantization_config", None) + do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" + except Exception: + pass + + if do_gptq_patching: + import torch + + torch.set_default_dtype(torch.float32) + orig_cuda_check = torch.cuda.is_available + torch.cuda.is_available = lambda: True + + from optimum.gptq import GPTQQuantizer + + orig_post_init_model = GPTQQuantizer.post_init_model + + def post_init_model(self, model): + from auto_gptq import exllama_set_max_input_length + + class StoreAttr(object): + pass + + model.quantize_config = StoreAttr() + model.quantize_config.desc_act = self.desc_act + if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + model = exllama_set_max_input_length(model, self.max_input_length) + return model + + GPTQQuantizer.post_init_model = post_init_model + framework = TasksManager.determine_framework(model_name_or_path, subfolder=subfolder, framework=framework) # get the shapes to be used to generate dummy inputs @@ -324,3 +359,8 @@ def main_export( int8=int8, model_kwargs=model_kwargs, ) + + # Unpatch modules after GPTQ export + if do_gptq_patching: + torch.cuda.is_available = orig_cuda_check + GPTQQuantizer.post_init_model = orig_post_init_model diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 0e018f9f62..4d87b7eec2 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -229,34 +229,6 @@ def _from_transformers( if use_cache: task = task + "-with-past" - # Patch the modules to export of GPTQ models w/o GPU - do_gptq_patching = False - config_dict = config.to_dict() - quantization_config = config_dict.get("quantization_config", None) - do_gptq_patching = quantization_config and quantization_config["quant_method"] == "gptq" - if do_gptq_patching: - torch.set_default_dtype(torch.float32) - orig_cuda_check = torch.cuda.is_available 
- torch.cuda.is_available = lambda: True - - from optimum.gptq import GPTQQuantizer - - orig_post_init_model = GPTQQuantizer.post_init_model - - def post_init_model(self, model): - from auto_gptq import exllama_set_max_input_length - - class StoreAttr(object): - pass - - model.quantize_config = StoreAttr() - model.quantize_config.desc_act = self.desc_act - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: - model = exllama_set_max_input_length(model, self.max_input_length) - return model - - GPTQQuantizer.post_init_model = post_init_model - main_export( model_name_or_path=model_id, output=save_dir_path, @@ -271,11 +243,6 @@ class StoreAttr(object): int8=load_in_8bit, ) - # Unpatch modules after GPTQ export - if do_gptq_patching: - torch.cuda.is_available = orig_cuda_check - GPTQQuantizer.post_init_model = orig_post_init_model - config.is_decoder = True config.is_encoder_decoder = False config.save_pretrained(save_dir_path) @@ -504,7 +471,7 @@ def _from_pretrained( elif model_type == "gpt-bigcode": init_cls = OVGPTBigCodeForCausalLM else: - init_cls = OVModelForCausalLM + init_cls = cls return init_cls(model=model, config=config, model_save_dir=model_cache_path.parent, **kwargs) From 59f63a52763571e6b43b088304a8d799ec103a6c Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 3 Nov 2023 19:00:00 +0100 Subject: [PATCH 5/9] Fix transformers v4.35.0 compatibility (#471) * fix trainer * fix * format * fix version --- .github/workflows/test_inc.yml | 3 +- .../intel/neural_compressor/quantization.py | 60 +++- optimum/intel/neural_compressor/trainer.py | 321 +++++++++++------ optimum/intel/openvino/trainer.py | 332 ++++++++++++------ setup.py | 5 +- tests/neural_compressor/test_modeling.py | 3 + tests/neural_compressor/test_onnx.py | 2 +- 7 files changed, 508 insertions(+), 218 deletions(-) diff --git a/.github/workflows/test_inc.yml b/.github/workflows/test_inc.yml index fd5fd16509..3a15214f99 100644 --- a/.github/workflows/test_inc.yml +++ b/.github/workflows/test_inc.yml @@ -30,7 +30,8 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install .[neural-compressor,ipex,diffusers,tests] + pip install .[neural-compressor,diffusers,tests] + pip install intel-extension-for-pytorch - name: Test with Pytest run: | pytest tests/neural_compressor/ diff --git a/optimum/intel/neural_compressor/quantization.py b/optimum/intel/neural_compressor/quantization.py index 36f16524c2..d4846adc15 100644 --- a/optimum/intel/neural_compressor/quantization.py +++ b/optimum/intel/neural_compressor/quantization.py @@ -15,6 +15,7 @@ import copy import inspect import logging +import warnings from enum import Enum from itertools import chain from pathlib import Path @@ -30,16 +31,25 @@ from neural_compressor.quantization import fit from torch.utils.data import DataLoader, RandomSampler from transformers import ( + AutoModelForCausalLM, + AutoModelForMaskedLM, + AutoModelForMultipleChoice, + AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoModelForVision2Seq, DataCollator, PretrainedConfig, PreTrainedModel, + XLNetLMHeadModel, default_data_collator, ) from optimum.exporters import TasksManager from optimum.exporters.onnx import OnnxConfig from optimum.onnxruntime import ORTModel -from optimum.onnxruntime.modeling_decoder import ORTModelDecoder +from optimum.onnxruntime.modeling_decoder import ORTModelForCausalLM from 
optimum.onnxruntime.modeling_seq2seq import ORTModelForConditionalGeneration from optimum.onnxruntime.utils import ONNX_DECODER_NAME from optimum.quantization_base import OptimumQuantizer @@ -256,7 +266,7 @@ def quantize( if isinstance(self._original_model, ORTModelForConditionalGeneration): raise RuntimeError("ORTModelForConditionalGeneration not supported for quantization") - if isinstance(self._original_model, ORTModelDecoder): + if isinstance(self._original_model, ORTModelForCausalLM): model_or_path = self._original_model.onnx_paths if len(model_or_path) > 1: raise RuntimeError( @@ -528,3 +538,49 @@ def _apply_quantization_from_config(q_config: Dict, model: torch.nn.Module) -> t q_model = convert(q_model, mapping=q_mapping, inplace=True) return q_model + + +class IncQuantizedModel(INCModel): + @classmethod + def from_pretrained(cls, *args, **kwargs): + warnings.warn( + f"The class `{cls.__name__}` has been depreciated and will be removed in optimum-intel v1.12, please use " + f"`{cls.__name__.replace('IncQuantized', 'INC')}` instead." + ) + return super().from_pretrained(*args, **kwargs) + + +class IncQuantizedModelForQuestionAnswering(IncQuantizedModel): + auto_model_class = AutoModelForQuestionAnswering + + +class IncQuantizedModelForSequenceClassification(IncQuantizedModel): + auto_model_class = AutoModelForSequenceClassification + + +class IncQuantizedModelForTokenClassification(IncQuantizedModel): + auto_model_class = AutoModelForTokenClassification + + +class IncQuantizedModelForMultipleChoice(IncQuantizedModel): + auto_model_class = AutoModelForMultipleChoice + + +class IncQuantizedModelForSeq2SeqLM(IncQuantizedModel): + auto_model_class = AutoModelForSeq2SeqLM + + +class IncQuantizedModelForCausalLM(IncQuantizedModel): + auto_model_class = AutoModelForCausalLM + + +class IncQuantizedModelForMaskedLM(IncQuantizedModel): + auto_model_class = AutoModelForMaskedLM + + +class IncQuantizedModelForXLNetLM(IncQuantizedModel): + auto_model_class = XLNetLMHeadModel + + +class IncQuantizedModelForVision2Seq(IncQuantizedModel): + auto_model_class = AutoModelForVision2Seq diff --git a/optimum/intel/neural_compressor/trainer.py b/optimum/intel/neural_compressor/trainer.py index 8e8fec1758..918a2e4885 100644 --- a/optimum/intel/neural_compressor/trainer.py +++ b/optimum/intel/neural_compressor/trainer.py @@ -15,12 +15,21 @@ import copy import math import os +import shutil import sys import time from collections.abc import Mapping from itertools import chain from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union + +# Integrations must be imported before ML frameworks: +# isort: off +from transformers.integrations import hp_params +from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available + +# isort: on + import datasets import torch import torch.distributed as dist @@ -28,38 +37,35 @@ from neural_compressor.compression import DistillationCallbacks from neural_compressor.conf.pythonic_config import _BaseQuantizationConfig from neural_compressor.experimental.export import torch_to_fp32_onnx, torch_to_int8_onnx - -# from packaging import version +from packaging import version from torch import nn from torch.utils.data import Dataset, RandomSampler -from torch.utils.data.dataloader import DataLoader -from torch.utils.data.distributed import DistributedSampler -from tqdm.auto import tqdm from transformers import Trainer from transformers.data.data_collator import DataCollator from transformers.debug_utils import 
DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init -from transformers.file_utils import WEIGHTS_NAME - -# Integrations must be imported before ML frameworks: -from transformers.integrations import hp_params from transformers.modeling_utils import PreTrainedModel, get_parameter_dtype, unwrap_model from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from transformers.pytorch_utils import is_torch_less_than_1_11 from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import TRAINER_STATE_NAME from transformers.trainer_callback import TrainerCallback, TrainerState -from transformers.trainer_pt_utils import IterableDatasetShard +from transformers.trainer_pt_utils import get_dataloader_sampler, get_model_param_count from transformers.trainer_utils import ( EvalPrediction, HPSearchBackend, - ShardedDDPOption, TrainOutput, has_length, speed_metrics, ) -from transformers.training_args import TrainingArguments -from transformers.utils import is_apex_available, is_sagemaker_mp_enabled, logging +from transformers.training_args import ParallelMode, TrainingArguments +from transformers.utils import ( + WEIGHTS_NAME, + is_accelerate_available, + is_apex_available, + is_sagemaker_mp_enabled, + is_torch_tpu_available, + logging, +) from optimum.exporters import TasksManager @@ -68,12 +74,31 @@ from .configuration import INCConfig +if is_accelerate_available(): + from accelerate import __version__ as accelerate_version + from accelerate import skip_first_batches + + if version.parse(accelerate_version) > version.parse("0.20.3"): + pass + DATA_SAMPLERS = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + DATA_SAMPLERS += [SeedableRandomSampler] + + if is_deepspeed_available(): + pass + + if is_apex_available(): from apex import amp if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp +if is_torch_tpu_available(check_device=False): + import torch_xla.core.xla_model as xm + if TYPE_CHECKING: from optimum.exporters.onnx import OnnxConfig @@ -109,6 +134,8 @@ def __init__( task: Optional[str] = None, save_onnx_model: bool = False, ): + self.neftune_noise_alpha = None + super().__init__( model, args, @@ -178,7 +205,9 @@ def __init__( def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): + self.accelerator.free_memory() self._train_batch_size = batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -186,9 +215,10 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None + num_train_tokens = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps @@ -230,58 +260,106 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = ( - self.sharded_ddp is not None - and 
self.sharded_ddp != ShardedDDPOption.SIMPLE - or is_sagemaker_mp_enabled() - or self.fsdp is not None - ) - if args.deepspeed: - deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint - ) - self.model = deepspeed_engine.module - self.model_wrapped = deepspeed_engine - self.deepspeed = deepspeed_engine - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - elif not delay_optimizer_creation: + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + self.state.save_steps = args.save_steps + # Activate gradient checkpointing if needed if args.gradient_checkpointing: - self.model.gradient_checkpointing_enable() + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model = self._wrap_model(self.model_wrapped) - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - self._load_from_checkpoint(resume_from_checkpoint, model) + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. 
+ model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model - if delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) # important: at this point: # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. # Train! logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -306,20 +384,19 @@ def _inner_training_loop( logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: logger.info( - f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " - "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " - "flag to your launch command, but you will resume the training on data already seen by your model." + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." 
) - if self.is_local_process_zero() and not args.disable_tqdm: - steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader - self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.state.trial_name = self.hp_name(self._trial) if trial is not None: assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) @@ -347,26 +424,26 @@ def _inner_training_loop( # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler - ) + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. - # That was before PyTorch 1.11 however... for _ in train_dataloader: break else: # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! - _ = list(train_dataloader.sampler) + sampler = sampler if sampler is not None else [] + _ = list(sampler) + total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): - train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): - train_dataloader.dataset.set_epoch(epoch) - epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) # Reset the past mems state at the beginning of each epoch if necessary. 
if args.past_index >= 0: @@ -385,8 +462,21 @@ def _inner_training_loop( if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + step = -1 for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -404,18 +494,14 @@ def _inner_training_loop( if self._compression_manager is not None: self._compression_manager.callbacks.on_step_begin(step) - if ( - ((step + 1) % args.gradient_accumulation_steps != 0) - and args.local_rank != -1 - and args._no_sync_in_gradient_accumulation - ): - # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. - with model.no_sync(): - tr_loss_step = self.training_step(model, inputs) - else: + with self.accelerator.accumulate(model): tr_loss_step = self.training_step(model, inputs) - if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)): + if ( + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): # if loss is nan or inf simply add the average of previous logged losses tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: @@ -423,35 +509,38 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps - if self.deepspeed: - self.deepspeed.step() + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) - if (step + 1) % args.gradient_accumulation_steps == 0 or ( + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps - and (step + 1) == steps_in_epoch + is_last_step_and_steps_less_than_grad_acc ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. 
+ if is_last_step_and_steps_less_than_grad_acc or ( + version.parse(accelerate_version) <= version.parse("0.20.3") + ): + self.accelerator.gradient_state._set_sync_gradients(True) + # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: + if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping - if self.do_grad_scaling: - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - if is_sagemaker_mp_enabled() and args.fp16: self.optimizer.clip_master_grads(args.max_grad_norm) - elif hasattr(self.optimizer, "clip_grad_norm"): - # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - self.optimizer.clip_grad_norm(args.max_grad_norm) - elif hasattr(model, "clip_grad_norm_"): - # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - model.clip_grad_norm_(args.max_grad_norm) - else: + elif self.use_apex: # Revert to normal clipping otherwise, handling Apex or full precision nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer) if self.use_apex else model.parameters(), + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), args.max_grad_norm, ) @@ -459,27 +548,20 @@ def _inner_training_loop( self._compression_manager.callbacks.on_before_optimizer_step() # Optimizer step - optimizer_was_run = True - if self.deepspeed: - pass # called outside the loop - elif self.do_grad_scaling: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() + self.optimizer.step() if self._compression_manager is not None: self._compression_manager.callbacks.on_after_optimizer_step() - if optimizer_was_run and not self.deepspeed: - self.lr_scheduler.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 - self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch self.control = self.callback_handler.on_step_end(args, self.state, self.control) if self._compression_manager is not None: self._compression_manager.callbacks.on_step_end() @@ -501,7 +583,6 @@ def _inner_training_loop( self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) if self._compression_manager is not None: self._compression_manager.callbacks.on_epoch_end() - self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) if self.control.should_training_stop: @@ -513,9 +594,10 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sur the model has been saved by process 0. - - if args.local_rank != -1: + # Wait for everyone to get here so we are sure the model has been saved by process 0. 
+ if is_torch_tpu_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() elif is_sagemaker_mp_enabled(): smp.barrier() @@ -526,7 +608,13 @@ def _inner_training_loop( self._total_loss_scalar += tr_loss.item() train_loss = self._total_loss_scalar / self.state.global_step - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -537,7 +625,26 @@ def _inner_training_loop( self.log(metrics) + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + if self._compression_manager is not None: self._compression_manager.callbacks.on_train_end() diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 0bba054ad3..17b0aa7058 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -16,6 +16,7 @@ import io import math import os +import shutil import sys import time from collections import defaultdict @@ -23,8 +24,15 @@ from pathlib import Path from typing import Callable, Dict, List, Optional, Tuple, Type, Union + +# Integrations must be imported before ML frameworks: +# isort: off +from transformers.integrations import hp_params +from transformers.integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available + +# isort: on + import openvino -import openvino.runtime import torch import torch.distributed as dist import torch.nn.functional as F @@ -46,40 +54,39 @@ compress_quantize_weights_transformation, ) from openvino.runtime import Core, PartialShape, save_model +from packaging import version +from torch import nn from torch.onnx import export as onnx_export from torch.utils._pytree import tree_map -from torch.utils.data import DataLoader, Dataset, RandomSampler -from torch.utils.data.distributed import DistributedSampler -from tqdm.auto import tqdm +from torch.utils.data import Dataset, RandomSampler from transformers import Trainer from transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init -from transformers.integrations import hp_params from transformers.modeling_utils import PreTrainedModel, unwrap_model from transformers.pytorch_utils import is_torch_less_than_1_11 from 
transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import TRAINER_STATE_NAME, TRAINING_ARGS_NAME from transformers.trainer_callback import TrainerCallback, TrainerState -from transformers.trainer_pt_utils import IterableDatasetShard +from transformers.trainer_pt_utils import get_dataloader_sampler, get_model_param_count from transformers.trainer_utils import ( EvalPrediction, HPSearchBackend, - ShardedDDPOption, TrainOutput, has_length, speed_metrics, ) +from transformers.training_args import ParallelMode from transformers.utils import ( WEIGHTS_NAME, + is_accelerate_available, is_apex_available, is_sagemaker_mp_enabled, is_torch_tpu_available, logging, ) +from optimum.exporters import TasksManager from optimum.exporters.onnx import OnnxConfig -from optimum.exporters.tasks import TasksManager from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_transformers_version @@ -95,6 +102,22 @@ ) +if is_accelerate_available(): + from accelerate import __version__ as accelerate_version + from accelerate import skip_first_batches + + if version.parse(accelerate_version) > version.parse("0.20.3"): + pass + DATA_SAMPLERS = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + DATA_SAMPLERS += [SeedableRandomSampler] + + if is_deepspeed_available(): + pass + + if is_apex_available(): from apex import amp @@ -171,6 +194,8 @@ def __init__( task: Optional[str] = None, feature: Optional[str] = None, ): + self.neftune_noise_alpha = None + super().__init__( model, args, @@ -244,7 +269,9 @@ def _set_signature_columns_if_needed(self): def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): + self.accelerator.free_memory() self._train_batch_size = batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") # Data loader and number of training steps train_dataloader = self.get_train_dataloader() @@ -252,9 +279,10 @@ def _inner_training_loop( # number of training epochs: num_train_epochs # number of training steps per epoch: num_update_steps_per_epoch # total number of training steps to execute: max_steps - total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size len_dataloader = None + num_train_tokens = None if has_length(train_dataloader): len_dataloader = len(train_dataloader) num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps @@ -268,10 +296,16 @@ def _inner_training_loop( # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's # the best we can do. 
num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = ( + self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps + ) else: max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) num_train_epochs = math.ceil(args.num_train_epochs) num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size max_steps = args.max_steps # Setting a very large number of epochs so we go as many times as necessary over the iterator. @@ -279,6 +313,8 @@ def _inner_training_loop( num_update_steps_per_epoch = max_steps num_examples = total_train_batch_size * args.max_steps num_train_samples = args.max_steps * total_train_batch_size + if args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps else: raise ValueError( "args.max_steps must be set to a positive value if dataloader does not have a length, was" @@ -287,7 +323,7 @@ def _inner_training_loop( if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: if self.args.n_gpu > 1: - # torch.nn.DataParallel(model) replicates the model, creating new variables and module + # nn.DataParallel(model) replicates the model, creating new variables and module # references registered here no longer work on other gpus, breaking the module raise ValueError( "Currently --debug underflow_overflow is not supported under DP. Please use DDP" @@ -296,30 +332,47 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = ( - self.sharded_ddp is not None - and self.sharded_ddp != ShardedDDPOption.SIMPLE - or is_sagemaker_mp_enabled() - or self.fsdp is not None - ) - if args.deepspeed: - deepspeed_engine, optimizer, lr_scheduler = deepspeed_init( - self, num_training_steps=max_steps, resume_from_checkpoint=resume_from_checkpoint - ) - self.model = deepspeed_engine.module - self.model_wrapped = deepspeed_engine - self.deepspeed = deepspeed_engine - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - elif not delay_optimizer_creation: + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled + + # We need to reset the scheduler, as its parameters may be different on subsequent calls + if self._created_lr_scheduler: + self.lr_scheduler = None + self._created_lr_scheduler = False + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None + # Compute absolute values for logging, eval, and save if given as ratio + if args.logging_steps is not None: + if args.logging_steps < 1: + self.state.logging_steps = math.ceil(max_steps * args.logging_steps) + else: + self.state.logging_steps = args.logging_steps + if args.eval_steps is not None: + if args.eval_steps < 1: + self.state.eval_steps = math.ceil(max_steps * args.eval_steps) + else: + self.state.eval_steps = args.eval_steps + if args.save_steps is not None: + if args.save_steps < 1: + self.state.save_steps = math.ceil(max_steps * args.save_steps) + else: + 
self.state.save_steps = args.save_steps + # Activate gradient checkpointing if needed if args.gradient_checkpointing: - self.model.gradient_checkpointing_enable() + if args.gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + else: + gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs + + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) if is_transformers_version("<", "4.29.0"): is_distributed = self.args.local_rank != -1 @@ -333,31 +386,67 @@ def _inner_training_loop( model = self._wrap_model(self.model_wrapped) - if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: - self._load_from_checkpoint(resume_from_checkpoint, model) + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if delay_optimizer_creation: + if use_accelerator_prepare: + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. + model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model # for the rest of this function `model` is the outside model, whether it was wrapped or not if model is not self.model: self.model_wrapped = model - if delay_optimizer_creation: - self.create_optimizer_and_scheduler(num_training_steps=max_steps) + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) # Check if saved optimizer or scheduler states exist self._load_optimizer_and_scheduler(resume_from_checkpoint) # important: at this point: # self.model is the Transformers Model - # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. # Train! logger.info("***** Running training *****") - logger.info(f" Num examples = {num_examples}") - logger.info(f" Num Epochs = {num_train_epochs}") - logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") - logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") - logger.info(f" Total optimization steps = {max_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") self.state.epoch = 0 start_time = time.time() @@ -382,20 +471,19 @@ def _inner_training_loop( logger.info(f" Continuing training from global step {self.state.global_step}") if not args.ignore_data_skip: logger.info( - f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " - "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " - "flag to your launch command, but you will resume the training on data already seen by your model." + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." ) - if self.is_local_process_zero() and not args.disable_tqdm: - steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) - steps_trained_progress_bar.set_description("Skipping the first batches") # Update the references self.callback_handler.model = self.model self.callback_handler.optimizer = self.optimizer self.callback_handler.lr_scheduler = self.lr_scheduler self.callback_handler.train_dataloader = train_dataloader - self.state.trial_name = self.hp_name(trial) if self.hp_name is not None else None + if self.hp_name is not None and self._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.state.trial_name = self.hp_name(self._trial) if trial is not None: assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial self.state.trial_params = hp_params(assignments) @@ -408,6 +496,7 @@ def _inner_training_loop( self.state.is_local_process_zero = self.is_local_process_zero() self.state.is_world_process_zero = self.is_world_process_zero() + # tr_loss is a tensor to avoid synchronization of TPUs through .item() tr_loss = torch.tensor(0.0).to(args.device) self.compression_metrics = defaultdict(lambda: torch.tensor(0.0).to(args.device)) # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses @@ -420,31 +509,33 @@ def _inner_training_loop( # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. 
if not args.ignore_data_skip: for epoch in range(epochs_trained): - is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( - train_dataloader.sampler, RandomSampler - ) + sampler = get_dataloader_sampler(train_dataloader) + sampler_kinds = [RandomSampler] + if version.parse(accelerate_version) > version.parse("0.23.0"): + sampler_kinds.append(SeedableRandomSampler) + is_random_sampler = isinstance(sampler, tuple(sampler_kinds)) if is_torch_less_than_1_11 or not is_random_sampler: # We just need to begin an iteration to create the randomization of the sampler. - # That was before PyTorch 1.11 however... for _ in train_dataloader: break else: - # Otherwise we need to call the whole sampler cause there is some random operation added + # Otherwise we need to call the whooooole sampler cause there is some random operation added # AT THE VERY END! - _ = list(train_dataloader.sampler) + sampler = sampler if sampler is not None else [] + _ = list(sampler) + total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): - if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): - train_dataloader.sampler.set_epoch(epoch) - elif hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDatasetShard): - train_dataloader.dataset.set_epoch(epoch) + epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) # Reset the past mems state at the beginning of each epoch if necessary. if args.past_index >= 0: self._past = None steps_in_epoch = ( - len(train_dataloader) + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps ) @@ -460,8 +551,21 @@ def _inner_training_loop( if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + step = -1 - for step, inputs in enumerate(train_dataloader): + for step, inputs in enumerate(epoch_iterator): + total_batched_samples += 1 + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 @@ -480,17 +584,14 @@ def _inner_training_loop( # Must be called at the beginning of each training step to prepare the compression method self.compression_controller.scheduler.step() + with self.accelerator.accumulate(model): + tr_loss_step = self.training_step(model, inputs) + if ( - ((step + 1) % args.gradient_accumulation_steps != 0) - and args.local_rank != -1 - and args._no_sync_in_gradient_accumulation + args.logging_nan_inf_filter + and not is_torch_tpu_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): - # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. 
- with model.no_sync(): - tr_loss_step = self.training_step(model, inputs) - else: - tr_loss_step = self.training_step(model, inputs) - if args.logging_nan_inf_filter and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)): # if loss is nan or inf simply add the average of previous logged losses tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) else: @@ -498,57 +599,52 @@ def _inner_training_loop( self.current_flos += float(self.floating_point_ops(inputs)) - # Optimizer step for deepspeed must be called on every step regardless of the value of gradient_accumulation_steps - if self.deepspeed: - self.deepspeed.step() + is_last_step_and_steps_less_than_grad_acc = ( + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch + ) - if (step + 1) % args.gradient_accumulation_steps == 0 or ( + if ( + total_batched_samples % args.gradient_accumulation_steps == 0 + or # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps - and (step + 1) == steps_in_epoch + is_last_step_and_steps_less_than_grad_acc ): + # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered + # in accelerate. So, explicitly enable sync gradients to True in that case. + if is_last_step_and_steps_less_than_grad_acc or ( + version.parse(accelerate_version) <= version.parse("0.20.3") + ): + self.accelerator.gradient_state._set_sync_gradients(True) + # Gradient clipping - if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: + if args.max_grad_norm is not None and args.max_grad_norm > 0: # deepspeed does its own clipping - if self.do_grad_scaling: - # AMP: gradients need unscaling - self.scaler.unscale_(self.optimizer) - if is_sagemaker_mp_enabled() and args.fp16: self.optimizer.clip_master_grads(args.max_grad_norm) - elif hasattr(self.optimizer, "clip_grad_norm"): - # Some optimizers (like the sharded optimizer) have a specific way to do gradient clipping - self.optimizer.clip_grad_norm(args.max_grad_norm) - elif hasattr(model, "clip_grad_norm_"): - # Some models (like FullyShardedDDP) have a specific way to do gradient clipping - model.clip_grad_norm_(args.max_grad_norm) - else: + elif self.use_apex: # Revert to normal clipping otherwise, handling Apex or full precision - torch.nn.utils.clip_grad_norm_( - amp.master_params(self.optimizer) if self.use_apex else model.parameters(), + nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + self.accelerator.clip_grad_norm_( + model.parameters(), args.max_grad_norm, ) # Optimizer step - optimizer_was_run = True - if self.deepspeed: - pass # called outside the loop - elif self.do_grad_scaling: - scale_before = self.scaler.get_scale() - self.scaler.step(self.optimizer) - self.scaler.update() - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: - self.optimizer.step() - - if optimizer_was_run and not self.deepspeed: - self.lr_scheduler.step() + self.optimizer.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() model.zero_grad() self.state.global_step += 1 - self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch 
self.control = self.callback_handler.on_step_end(args, self.state, self.control) self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval) @@ -559,7 +655,7 @@ def _inner_training_loop( break if step < 0: logger.warning( - "There seems to be not a single sample in your train_dataloader, stopping training at step" + "There seems to be not a single sample in your epoch_iterator, stopping training at step" f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" f" num_steps ({max_steps}) higher than the number of available samples." ) @@ -577,8 +673,10 @@ def _inner_training_loop( logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: - # Wait for everyone to get here so we are sur the model has been saved by process 0. - if args.local_rank != -1: + # Wait for everyone to get here so we are sure the model has been saved by process 0. + if is_torch_tpu_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: dist.barrier() elif is_sagemaker_mp_enabled(): smp.barrier() @@ -589,7 +687,13 @@ def _inner_training_loop( self._total_loss_scalar += tr_loss.item() train_loss = self._total_loss_scalar / self.state.global_step - metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) self.store_flos() metrics["total_flos"] = self.state.total_flos metrics["train_loss"] = train_loss @@ -600,8 +704,26 @@ def _inner_training_loop( self.log(metrics) + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint) + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. 
+ if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + return TrainOutput(self.state.global_step, train_loss, metrics) def compute_distillation_loss(self, inputs, student_outputs): diff --git a/setup.py b/setup.py index 6d81b98b2a..0c1feace30 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ INSTALL_REQUIRE = [ "optimum>=1.13.0", - "transformers>=4.20.0", + "transformers", "datasets>=1.4.0", "sentencepiece", "scipy", @@ -41,8 +41,9 @@ "neural-compressor>=2.2.0", "onnx", "onnxruntime<1.15.0", + "transformers>=4.33.0", ], - "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime"], + "openvino": ["openvino>=2023.1.0", "onnx", "onnxruntime", "transformers>=4.33.0"], "nncf": ["nncf>=2.6.0"], "ipex": ["transformers<4.32.0", "intel-extension-for-pytorch", "onnx"], "diffusers": ["diffusers"], diff --git a/tests/neural_compressor/test_modeling.py b/tests/neural_compressor/test_modeling.py index fc2a310595..8098f011c5 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -19,6 +19,7 @@ import unittest import torch +from packaging.version import Version, parse from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed @@ -39,6 +40,7 @@ INCTrainer, ) from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME +from optimum.version import __version__ as _optimum_version os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -133,6 +135,7 @@ def test_pipeline(self, model_id, task): pipe(*inputs) + @unittest.skipIf(parse(_optimum_version) < Version("1.14.0"), "not supported, needs optimum>=v1.14.0") def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) diff --git a/tests/neural_compressor/test_onnx.py b/tests/neural_compressor/test_onnx.py index f5dc0b7c66..387c369dd1 100644 --- a/tests/neural_compressor/test_onnx.py +++ b/tests/neural_compressor/test_onnx.py @@ -54,7 +54,7 @@ def test_static_quantization(self, task, model_name, expected_quantized_matmuls) tokenizer.pad_token = tokenizer.eos_token quantizer = INCQuantizer.from_pretrained(model, task=task) calibration_dataset = _generate_dataset(quantizer, tokenizer, num_samples=num_samples) - save_onnx_model = True + save_onnx_model = False op_type_dict = ( {"Embedding": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}} if save_onnx_model From 95622358edda0783e7a60d396992284452d81b07 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 6 Nov 2023 11:34:52 +0100 Subject: [PATCH 6/9] Fix OV trainer model saving step (#472) --- optimum/intel/openvino/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/intel/openvino/trainer.py b/optimum/intel/openvino/trainer.py index 17b0aa7058..dfc659882c 100644 --- a/optimum/intel/openvino/trainer.py +++ b/optimum/intel/openvino/trainer.py @@ -827,12 +827,12 @@ def _save(self, output_dir: Optional[str] = None, state_dict=None): if state_dict is None: state_dict = self.model.state_dict() if is_pretrained_model: - unwrapped_model.save_pretrained(output_dir, state_dict=state_dict) + unwrapped_model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False) else: logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) else: - self.model.save_pretrained(output_dir, 
state_dict=state_dict) + self.model.save_pretrained(output_dir, state_dict=state_dict, safe_serialization=False) if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) From 99a3970de77bfcebe491abd41ff672f7db5b7a9f Mon Sep 17 00:00:00 2001 From: Helena Kloosterman Date: Mon, 6 Nov 2023 14:03:31 +0100 Subject: [PATCH 7/9] Add OpenVINO weights compression to docs (#435) * Add weights compression to docs * Update optimization_ov.mdx --- docs/source/optimization_ov.mdx | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/source/optimization_ov.mdx b/docs/source/optimization_ov.mdx index 866573dca9..0f51d3cb60 100644 --- a/docs/source/optimization_ov.mdx +++ b/docs/source/optimization_ov.mdx @@ -62,6 +62,27 @@ tokenizer.save_pretrained(save_dir) The `quantize()` method applies post-training static quantization and export the resulting quantized model to the OpenVINO Intermediate Representation (IR). The resulting graph is represented with two files: an XML file describing the network topology and a binary file describing the weights. The resulting model can be run on any target Intel device. +### Weights compression + +For large language models (LLMs), it is often beneficial to only quantize weights, and keep activations in floating point precision. This method does not require a calibration dataset. To enable weights compression, set the `weights_only` parameter of `OVQuantizer`: + +```python +from optimum.intel.openvino import OVQuantizer, OVModelForCausalLM +from transformers import AutoModelForCausalLM + +save_dir = "int8_weights_compressed_model" +model = AutoModelForCausalLM.from_pretrained("databricks/dolly-v2-3b") +quantizer = OVQuantizer.from_pretrained(model, task="text-generation") +quantizer.quantize(save_directory=save_dir, weights_only=True) +``` + +To load the optimized model for inference: + +```python +optimized_model = OVModelForCausalLM.from_pretrained(save_dir) +``` + +Weights compression is enabled for PyTorch and OpenVINO models: the starting model can be an `AutoModelForCausalLM` or `OVModelForCausalLM` instance. ## Training-time optimization @@ -221,4 +242,4 @@ text = "He's a dreadful magician." outputs = cls_pipe(text) [{'label': 'NEGATIVE', 'score': 0.9840195178985596}] -``` \ No newline at end of file +``` From c5ed584503f3eb19ba7c8f661e94aebffc3d08e9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 6 Nov 2023 14:03:56 +0100 Subject: [PATCH 8/9] Add CLI openvino export in documentation (#440) * add cli openvino export readme * minor * add int8 * add in documentation * add int8 section * format * add comment --- README.md | 43 +++++++-- docs/source/inference.mdx | 196 ++++++++++++++++++++++++-------------- 2 files changed, 161 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index e06f91ef17..9f25eefd94 100644 --- a/README.md +++ b/README.md @@ -67,26 +67,51 @@ For more details on the supported compression techniques, please refer to the [d Below are the examples of how to use OpenVINO and its [NNCF](https://docs.openvino.ai/latest/tmo_introduction.html) framework to accelerate inference. +#### Export: + +It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI : + +```plain +optimum-cli export openvino --model gpt2 ov_model +``` + +If you add `--int8`, the weights will be quantized to INT8, the activations will be kept in floating point precision. 
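
Once the model has been exported, with or without `--int8` (the exact command is shown just below), the resulting directory can be loaded back with the corresponding `OVModelForXxx` class for a quick sanity check. A minimal sketch, assuming the export wrote the IR to `ov_model` and reusing the tokenizer of the original `gpt2` checkpoint:

```python
from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM

# Load the OpenVINO IR produced by `optimum-cli export openvino --model gpt2 ov_model`
model = OVModelForCausalLM.from_pretrained("ov_model")
# Reuse the tokenizer of the source checkpoint in case it was not saved alongside the IR
tokenizer = AutoTokenizer.from_pretrained("gpt2")

inputs = tokenizer("OpenVINO makes inference", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```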
+ +```plain +optimum-cli export openvino --model gpt2 --int8 ov_model +``` + + #### Inference: To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. -If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR. + ```diff -- from transformers import AutoModelForSequenceClassification -+ from optimum.intel import OVModelForSequenceClassification +- from transformers import AutoModelForSeq2SeqLM ++ from optimum.intel import OVModelForSeq2SeqLM from transformers import AutoTokenizer, pipeline - model_id = "distilbert-base-uncased-finetuned-sst-2-english" -- model = AutoModelForSequenceClassification.from_pretrained(model_id) -+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True) + model_id = "echarlaix/t5-small-openvino" +- model = AutoModelForSeq2SeqLM.from_pretrained(model_id) ++ model = OVModelForSeq2SeqLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) - model.save_pretrained("./distilbert") + pipe = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) + results = pipe("He never went out without a book under his arm, and he often came back with two.") - classifier = pipeline("text-classification", model=model, tokenizer=tokenizer) - results = classifier("He's a dreadful magician.") + [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` +If you want to load a PyTorch checkpoint, set `export=True` to convert your model to the OpenVINO IR. + +```python +from optimum.intel import OVModelForCausalLM + +model = OVModelForCausalLM.from_pretrained("gpt2", export=True) +model.save_pretrained("./ov_model") +``` + + #### Post-training static quantization: Post-training static quantization introduces an additional calibration step where data is fed through the network in order to compute the activations quantization parameters. Here is an example on how to apply static quantization on a fine-tuned DistilBERT. diff --git a/docs/source/inference.mdx b/docs/source/inference.mdx index f0a6d2edab..bfd15bde11 100644 --- a/docs/source/inference.mdx +++ b/docs/source/inference.mdx @@ -11,34 +11,82 @@ specific language governing permissions and limitations under the License. Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/models?library=openvino&sort=downloads) and create pipelines to run inference with OpenVINO Runtime without rewriting your APIs. -## Switching from Transformers to Optimum +## Transformers models You can now easily perform inference with OpenVINO Runtime on a variety of Intel processors ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). For that, just replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. -To load a Transformers model and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. -Here is an example on how to perform inference with OpenVINO Runtime for a text classification class: +As shown in the table below, each task is associated with a class enabling to automatically load your model. 
+ +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-classification` | `OVModelForSequenceClassification` | +| `token-classification` | `OVModelForTokenClassification` | +| `question-answering` | `OVModelForQuestionAnswering` | +| `audio-classification` | `OVModelForAudioClassification` | +| `image-classification` | `OVModelForImageClassification` | +| `feature-extraction` | `OVModelForFeatureExtraction` | +| `fill-mask` | `OVModelForMaskedLM` | +| `text-generation` | `OVModelForCausalLM` | +| `text2text-generation` | `OVModelForSeq2SeqLM` | + + +### Export + +It is possible to export your model to the [OpenVINO](https://docs.openvino.ai/2023.1/openvino_ir.html) IR format with the CLI : + +```bash +optimum-cli export openvino --model gpt2 ov_model +``` + +The example above illustrates exporting a checkpoint from the 🤗 Hub. When exporting a local model, first make sure that you saved both the model’s weights and tokenizer files in the same directory (`local_path`). +When using CLI, pass the `local_path` to the model argument instead of the checkpoint name of the model hosted on the Hub and provide the `--task` argument. You can review the list of supported tasks in the 🤗 [Optimum documentation](https://huggingface.co/docs/optimum/exporters/task_manager). If task argument is not provided, it will default to the model architecture without any task specific head. +Here we set the `task` to `text-generation-with-past`, with the `-with-past` suffix enabling the re-use of the pre-computed key/values hidden-states `use_cache=True`. + +```bash +optimum-cli export openvino --model local_path --task text-generation-with-past ov_model +``` + +Once the model is exported, you can load the OpenVINO model using : + +```python +from optimum.intel import AutoModelForCausalLM + +model_id = "helenai/gpt2-ov" +model = AutoModelForCausalLM.from_pretrained(model_id) +``` + +You can also load your PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, by setting `export=True` when loading your model. 
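
The same on-the-fly conversion applies to every class in the table above. Below is a brief sketch for a sequence classification checkpoint, reusing the `distilbert-base-uncased-finetuned-sst-2-english` model referenced elsewhere in these docs; the snippet right after this block shows the equivalent flow for a causal LM:

```python
from transformers import AutoTokenizer
from optimum.intel import OVModelForSequenceClassification

model_id = "distilbert-base-uncased-finetuned-sst-2-english"
# export=True converts the PyTorch checkpoint to the OpenVINO IR on the fly
model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Save both the converted model (XML + BIN) and the tokenizer for later reuse
model.save_pretrained("ov_distilbert")
tokenizer.save_pretrained("ov_distilbert")
```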
+
+```python
+from optimum.intel import AutoModelForCausalLM
+
+model_id = "gpt2"
+model = AutoModelForCausalLM.from_pretrained(model_id, export=True)
+model.save_pretrained("ov_model")
+```
+
+### Inference
+
+You can load an OpenVINO model hosted on the hub and perform inference; there is no need to adapt your code to get it to work with `OVModelForXxx` classes:
 
 ```diff
-- from transformers import AutoModelForCausalLM
++ from optimum.intel import OVModelForCausalLM
  from transformers import AutoTokenizer, pipeline
 
- model_id = "distilbert-base-uncased-finetuned-sst-2-english"
-- model = AutoModelForSequenceClassification.from_pretrained(model_id)
-+ model = OVModelForSequenceClassification.from_pretrained(model_id, export=True)
+ model_id = "helenai/gpt2-ov"
+- model = AutoModelForCausalLM.from_pretrained(model_id)
++ model = OVModelForCausalLM.from_pretrained(model_id)
  tokenizer = AutoTokenizer.from_pretrained(model_id)
- cls_pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
- outputs = cls_pipe("He's a dreadful magician.")
-
- [{'label': 'NEGATIVE', 'score': 0.9919503927230835}]
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+ results = pipe("He's a dreadful magician and")
 ```
 
 See the [reference documentation](reference_ov) for more information about parameters, and examples for different tasks.
 
 To easily save the resulting model, you can use the `save_pretrained()` method, which will save both the BIN and XML files describing the graph. It is useful to save the tokenizer to the same directory, to enable easy loading of the tokenizer for the model.
-
 ```python
 # Save the exported model
 save_directory = "openvino_distilbert"
@@ -46,6 +94,28 @@ model.save_pretrained(save_directory)
 tokenizer.save_pretrained(save_directory)
 ```
 
+### Weight only quantization
+
+You can also apply INT8 quantization on your model's weights when exporting your model with the CLI:
+
+```bash
+optimum-cli export openvino --model gpt2 --int8 ov_model
+```
+
+This will result in the exported model's linear and embedding layers being quantized to INT8, while the activations are kept in floating point precision.
+
+This can also be done when loading your model by setting the `load_in_8bit` argument when calling the `from_pretrained()` method.
+
+```python
+from optimum.intel import OVModelForCausalLM
+
+model = OVModelForCausalLM.from_pretrained(model_id, load_in_8bit=True)
+```
+
+To apply quantization on both weights and activations, you can use the `OVQuantizer`; more information is available in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#optimization).
+
+### Static shape
+
 By default, `OVModelForXxx` support dynamic shapes, enabling inputs of every shapes. To speed up inference, static shapes can be enabled by giving the desired inputs shapes.
 
 ```python
@@ -55,7 +125,6 @@ model.reshape(1, 9)
 model.compile()
 ```
 
-
 When fixing the shapes with the `reshape()` method, inference cannot be performed with an input of a different shape. When instantiating your pipeline, you can specify the maximum total input sequence length after tokenization in order for shorter sequences to be padded and for longer sequences to be truncated.
```python @@ -81,16 +150,7 @@ qa_pipe = pipeline( metric = task_evaluator.compute(model_or_pipeline=qa_pipe, data=eval_dataset, metric="squad") ``` - -To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/nightly/openvino_docs_install_guides_configurations_for_intel_gpu.html) about installing drivers for GPU inference). - -```python -# Static shapes speed up inference -model.reshape(1, 9) -model.to("gpu") -# Compile the model before the first inference -model.compile() -``` +### Compilation By default the model will be compiled when instantiating our `OVModel`. In the case where the model is reshaped or placed to another device, the model will need to be recompiled again, which will happen by default before the first inference (thus inflating the latency of the first inference). To avoid an unnecessary compilation, you can disable the first compilation by setting `compile=False`. The model can be compiled before the first inference with `model.compile()`. @@ -106,6 +166,19 @@ model.reshape(1,128) model.compile() ``` +To run inference on Intel integrated or discrete GPU, use `.to("gpu")`. On GPU, models run in FP16 precision by default. (See [OpenVINO documentation](https://docs.openvino.ai/nightly/openvino_docs_install_guides_configurations_for_intel_gpu.html) about installing drivers for GPU inference). + +```python +# Static shapes speed up inference +model.reshape(1, 9) +model.to("gpu") +# Compile the model before the first inference +model.compile() +``` + +### Configuration + + It is possible to pass an `ov_config` parameter to `from_pretrained()` with custom OpenVINO configuration values. This can be used for example to enable full precision inference on devices where FP16 or BF16 inference precision is used by default. @@ -120,7 +193,7 @@ Optimum Intel leverages OpenVINO's model caching to speed up model compiling. By model = OVModelForSequenceClassification.from_pretrained(model_id, ov_config={"CACHE_DIR":""}) ``` -## Sequence-to-sequence models +### Sequence-to-sequence models Sequence-to-sequence (Seq2Seq) models, that generate a new sequence from an input, can also be used when running inference with OpenVINO. When Seq2Seq models are exported to the OpenVINO IR, they are decomposed into two parts : the encoder and the "decoder" (which actually consists of the decoder with the language modeling head), that are later combined during inference. To speed up sequential decoding, a cache with pre-computed key/values hidden-states will be used by default. An additional model component will be exported: the "decoder" with pre-computed key/values as one of its inputs. This specific export comes from the fact that during the first pass, the decoder has no pre-computed key/values hidden-states, while during the rest of the generation past key/values will be used to speed up sequential decoding. To disable this cache, set `use_cache=False` in the `from_pretrained()` method. @@ -147,23 +220,33 @@ tokenizer.save_pretrained(save_directory) [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] ``` -## Stable Diffusion +## Diffusers models + +Make sure you have 🤗 Diffusers installed. + +To install `diffusers`: +```bash +pip install optimum[diffusers] +``` + + +### Stable Diffusion Stable Diffusion models can also be used when running inference with OpenVINO. 
When Stable Diffusion models -are exported to the OpenVINO format, they are decomposed into three components that are later combined during inference: +are exported to the OpenVINO format, they are decomposed into different components that are later combined during inference: - The text encoder - The U-NET - The VAE encoder - The VAE decoder -Make sure you have 🤗 Diffusers installed. +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionPipeline` | +| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | +| `inpaint` | `OVStableDiffusionInpaintPipeline` | -To install `diffusers`: -```bash -pip install optimum[diffusers] -``` -### Text-to-Image +#### Text-to-Image Here is an example of how you can load an OpenVINO Stable Diffusion model and run inference using OpenVINO Runtime: ```python @@ -208,7 +291,7 @@ In case you want to change any parameters such as the outputs height or width, y -### Text-to-Image with Textual Inversion +#### Text-to-Image with Textual Inversion Here is an example of how you can load an OpenVINO Stable Diffusion model with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: @@ -248,7 +331,7 @@ The left image shows the generation result of original stable diffusion v1.5, th | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/stable_diffusion_v1_5_with_textual_inversion.png) | -### Image-to-Image +#### Image-to-Image ```python import requests @@ -269,16 +352,15 @@ image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale= image.save("fantasy_landscape.png") ``` -## Stable Diffusion XL +### Stable Diffusion XL -Before using `OVtableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. 
You can install the libraries as follows: +| Task | Auto Class | +|--------------------------------------|--------------------------------------| +| `text-to-image` | `OVStableDiffusionXLPipeline` | +| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | -```bash -pip install diffusers -pip install invisible-watermark>=0.2.0 -``` -### Text-to-Image +#### Text-to-Image Here is an example of how you can load a SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using OpenVINO Runtime: @@ -296,7 +378,7 @@ image.save("train_station.png") |---|---| | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/train_station_friedrich_2.png) | -### Text-to-Image with Textual Inversion +#### Text-to-Image with Textual Inversion Here is an example of how you can load an SDXL OpenVINO model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) with pre-trained textual inversion embeddings and run inference using OpenVINO Runtime: @@ -338,7 +420,7 @@ The left image shows the generation result of the original SDXL base 1.0, the ri | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_without_textual_inversion.png) | ![](https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/textual_inversion/sdxl_with_textual_inversion.png) | -### Image-to-Image +#### Image-to-Image Here is an example of how you can load a PyTorch SDXL model, convert it to OpenVINO on-the-fly and run inference using OpenVINO Runtime for *image-to-image*: @@ -358,7 +440,7 @@ pipeline.save_pretrained("openvino-sd-xl-refiner-1.0") ``` -### Refining the image output +#### Refining the image output The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. @@ -372,27 +454,3 @@ refiner = OVStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=Tr image = base(prompt=prompt, output_type="latent").images[0] image = refiner(prompt=prompt, image=image[None, :]).images[0] ``` - - - -## Supported tasks - -As shown in the table below, each task is associated with a class enabling to automatically load your model. 
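
The task tables above also list `OVStableDiffusionInpaintPipeline` for inpainting, which has no dedicated example on this page. The following is a rough sketch only: it assumes the call mirrors the Diffusers inpainting API, and the checkpoint name and placeholder images are illustrative, not prescriptive:

```python
from PIL import Image
from optimum.intel import OVStableDiffusionInpaintPipeline

# Assumed checkpoint: any Stable Diffusion inpainting model supported by Diffusers should work similarly
model_id = "runwayml/stable-diffusion-inpainting"
pipeline = OVStableDiffusionInpaintPipeline.from_pretrained(model_id, export=True)

# Placeholder inputs: in practice, use a real photo and a mask marking the region to repaint
# (white pixels are repainted, black pixels are kept)
init_image = Image.new("RGB", (512, 512), "white")
mask_image = Image.new("RGB", (512, 512), "black")

prompt = "a red brick wall"
image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0]
image.save("inpainting_result.png")
```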
- - -| Task | Auto Class | -|--------------------------------------|--------------------------------------| -| `text-classification` | `OVModelForSequenceClassification` | -| `token-classification` | `OVModelForTokenClassification` | -| `question-answering` | `OVModelForQuestionAnswering` | -| `audio-classification` | `OVModelForAudioClassification` | -| `image-classification` | `OVModelForImageClassification` | -| `feature-extraction` | `OVModelForFeatureExtraction` | -| `fill-mask` | `OVModelForMaskedLM` | -| `text-generation` | `OVModelForCausalLM` | -| `text2text-generation` | `OVModelForSeq2SeqLM` | -| `text-to-image` | `OVStableDiffusionPipeline` | -| `text-to-image` | `OVStableDiffusionXLPipeline` | -| `image-to-image` | `OVStableDiffusionImg2ImgPipeline` | -| `image-to-image` | `OVStableDiffusionXLImg2ImgPipeline` | -| `inpaint` | `OVStableDiffusionInpaintPipeline` | From bf8e95c2beb75cdf37cfa8eee5144a6934ed6eb4 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 6 Nov 2023 23:31:46 +0100 Subject: [PATCH 9/9] Fix compatibility for optimum v1.14.0 (#460) * Enable openvino inference for gpt big code models * fix * format * fix input names * Fix export optimum modifications --- optimum/exporters/openvino/__main__.py | 29 ++--- optimum/intel/generation/modeling.py | 131 ++++++++++----------- optimum/intel/openvino/modeling_base.py | 15 +-- optimum/intel/openvino/modeling_decoder.py | 22 +++- optimum/intel/openvino/quantization.py | 8 +- optimum/intel/utils/modeling_utils.py | 41 ++++++- setup.py | 4 +- tests/generation/test_modeling.py | 22 +++- tests/openvino/test_modeling.py | 8 +- tests/openvino/test_quantization.py | 4 +- tests/openvino/utils_tests.py | 4 +- 11 files changed, 169 insertions(+), 119 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index b17d93aa5e..cb011706c8 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -27,7 +27,6 @@ from optimum.utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from ...intel.utils.import_utils import is_nncf_available -from ...intel.utils.modeling_utils import patch_decoder_attention_mask from .convert import export_models @@ -257,24 +256,18 @@ class StoreAttr(object): preprocessors = maybe_load_preprocessors( model_name_or_path, subfolder=subfolder, trust_remote_code=trust_remote_code ) - if not task.startswith("text-generation"): - onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( - model=model, - task=task, - monolith=False, - custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, - custom_architecture=custom_architecture, - fn_get_submodels=fn_get_submodels, - preprocessors=preprocessors, - _variant="default", - ) - else: - # TODO : ModelPatcher will be added in next optimum release - model = patch_decoder_attention_mask(model) - onnx_config_constructor = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) - onnx_config = onnx_config_constructor(model.config) - models_and_onnx_configs = {"model": (model, onnx_config)} + onnx_config, models_and_onnx_configs = optimum_main._get_submodels_and_onnx_configs( + model=model, + task=task, + monolith=False, + custom_onnx_configs=custom_onnx_configs if custom_onnx_configs is not None else {}, + custom_architecture=custom_architecture, + fn_get_submodels=fn_get_submodels, + preprocessors=preprocessors, + _variant="default", 
+ legacy=False, + ) if int8 is None: int8 = False diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py index b4c41e0be1..bbfc3db63d 100644 --- a/optimum/intel/generation/modeling.py +++ b/optimum/intel/generation/modeling.py @@ -26,12 +26,13 @@ from transformers.utils import WEIGHTS_NAME from optimum.exporters import TasksManager +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.modeling_base import OptimizedModel from optimum.utils import NormalizedConfigManager from ..utils.constant import _TASK_ALIASES from ..utils.import_utils import is_torch_version, is_transformers_version -from ..utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask +from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask if is_transformers_version("<", "4.25.0"): @@ -47,43 +48,29 @@ def prepare_jit_inputs(model: PreTrainedModel, task: str, use_cache: bool = Fals task = _TASK_ALIASES.get(task, task) signature = inspect.signature(model.forward) if hasattr(model, "forward") else inspect.signature(model.__call__) onnx_config_class = TasksManager.get_exporter_config_constructor(model=model, exporter="onnx", task=task) - onnx_config = onnx_config_class(model.config) - if task == "text-generation" and use_cache: - onnx_config = onnx_config_class(model.config, use_past=True, use_past_in_inputs=True) + if "text-generation" in task: + onnx_config = onnx_config_class(model.config, use_past=use_cache, use_past_in_inputs=use_cache) + else: + onnx_config = onnx_config_class(model.config) + dummy_inputs = onnx_config.generate_dummy_inputs(framework="pt") - model_inputs = {key: dummy_inputs[key] for key in signature.parameters if dummy_inputs.get(key, None) is not None} - if task == "text-generation" and use_cache and model.config.model_type != "gpt_bigcode": - # WA jit.trace issue of model like llama in https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L464, or else, generation output will be incorrect - pkv = [] - for i in range(len(model_inputs["past_key_values"])): - pkv.append([]) - for j in range(len(model_inputs["past_key_values"][0])): - pkv[i].append(model_inputs["past_key_values"][i][j].to(model.dtype)) - pkv[i] = tuple(pkv[i]) - model_inputs["past_key_values"] = tuple(pkv) - i = model_inputs["input_ids"] - a = model_inputs["attention_mask"] - model_inputs["input_ids"] = torch.cat([torch.zeros(i.shape[0], 1), i], -1).to(i.dtype) - model_inputs["attention_mask"] = torch.cat([torch.zeros(a.shape[0], 1), a], -1).to(a.dtype) - return model_inputs + + return {key: dummy_inputs[key] for key in signature.parameters if dummy_inputs.get(key, None) is not None} def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False): model_inputs = prepare_jit_inputs(model, task, use_cache) # check if the model_inputs is correct. 
model(**model_inputs) + torch._C._jit_set_texpr_fuser_enabled(False) if "past_key_values" in model_inputs.keys(): model.config.return_dict = False - if is_torch_version(">", "2.0.1"): - traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) - else: - traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + if is_torch_version(">=", "2.1.0"): + traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) else: - if is_torch_version(">=", "2.0.0"): - traced_model = torch.jit.trace(model, example_kwarg_inputs=model_inputs, strict=False) - else: - traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + traced_model = torch.jit.trace(model, example_inputs=tuple(model_inputs.values()), strict=False) + traced_model = torch.jit.freeze(traced_model.eval()) traced_model(**model_inputs) traced_model(**model_inputs) @@ -91,11 +78,7 @@ def jit_trace(model: PreTrainedModel, task: str, use_cache: bool = False): return traced_model -class PreTrainedModel(OptimizedModel): - pass - - -class BaseModelForCausalLM(PreTrainedModel, GenerationMixin): +class BaseModelForCausalLM(OptimizedModel, GenerationMixin): auto_model_class = AutoModelForCausalLM export_feature = "text-generation" main_input_name = "input_ids" @@ -156,12 +139,23 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg if past_key_values[0][0].shape[0] == input_ids.shape[0]: past_key_values = self._convert_to_bloom_cache(past_key_values) + position_ids = kwargs.get("position_ids", None) + + attention_mask = kwargs.get("attention_mask", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + return { "input_ids": input_ids, "past_key_values": past_key_values, "use_cache": self.use_cache, - "position_ids": None, - "attention_mask": kwargs.get("attention_mask", None), + "position_ids": position_ids, + "attention_mask": attention_mask, "token_type_ids": None, } @@ -258,6 +252,7 @@ def forward( input_ids: torch.LongTensor = None, attention_mask: Optional[torch.FloatTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + position_ids: Optional[torch.FloatTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: if attention_mask is None: @@ -268,43 +263,42 @@ def forward( "attention_mask": attention_mask, } + model_type = self.config.model_type.replace("_", "-") + if self.use_cache: if past_key_values is None: nb_pkv = 2 num_layers = self.normalized_config.num_layers - num_attention_heads = self.normalized_config.num_attention_heads - num_key_value_heads = num_attention_heads - if hasattr(self.normalized_config, "num_key_value_heads"): - num_key_value_heads = self.normalized_config.num_key_value_heads - hidden_size = self.normalized_config.hidden_size - d_k = hidden_size // num_attention_heads - if self.config.model_type == "gpt_bigcode": - new_shape = [input_ids.shape[0], 0, d_k * 2] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - past_key_values = tuple([empty_tensor] * num_layers) - elif self.config.model_type != "bloom": - new_shape = [input_ids.shape[0], num_key_value_heads, 0, d_k] - empty_tensor = torch.empty(size=new_shape) - 
if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - pkv = tuple(empty_tensor for _ in range(nb_pkv)) + d_k = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads + batch_size = input_ids.shape[0] + + if model_type in {"mistral", "llama"}: + num_attention_heads = self.normalized_config.num_key_value_heads else: - pkv = () - for nb_pkv in range(nb_pkv): - if nb_pkv % 2 == 0: - new_shape = [input_ids.shape[0] * num_key_value_heads, d_k, 0] - else: - new_shape = [input_ids.shape[0] * num_key_value_heads, 0, d_k] - empty_tensor = torch.empty(size=new_shape) - if self.model_dtype is not None: - empty_tensor = empty_tensor.to(self.model_dtype) - pkv = pkv + (empty_tensor,) - if past_key_values is None: - past_key_values = tuple(tuple(pkv) for _ in range(num_layers)) + num_attention_heads = self.normalized_config.num_attention_heads + + if model_type == "bloom": + shape_key = (batch_size * num_attention_heads, d_k, 0) + shape_value = (batch_size * num_attention_heads, 0, d_k) + key = torch.empty(size=shape_key, dtype=self.model_dtype, device=self._device) + value = torch.empty(size=shape_value, dtype=self.model_dtype, device=self._device) + past_key_values = tuple( + tuple(key if idx % 2 == 0 else value for idx in range(nb_pkv)) for _ in range(num_layers) + ) + elif model_type.replace("-", "_") in MULTI_QUERY_ATTN_MODELS: + shape = (batch_size, 0, d_k * 2) + pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device) + past_key_values = tuple(pkv for _ in range(num_layers)) + else: + shape = (batch_size, num_attention_heads, 0, d_k) + pkv = torch.empty(size=shape, dtype=self.model_dtype, device=self._device) + past_key_values = tuple(tuple(pkv for _ in range(nb_pkv)) for _ in range(num_layers)) inputs["past_key_values"] = past_key_values + + if position_ids is not None and model_type in MODEL_TYPES_REQUIRING_POSITION_IDS: + inputs["position_ids"] = position_ids + outputs = self.model(**inputs) if isinstance(outputs, (list, tuple)): @@ -389,7 +383,7 @@ def _from_transformers( torch_dtype: Optional[Union[str, "torch.dtype"]] = None, **kwargs, ): - if is_torch_version("<", "2.0.0"): + if is_torch_version("<", "2.1.0"): raise ImportError("`torch>=2.0.0` is needed to trace your model") task = cls.export_feature @@ -405,12 +399,7 @@ def _from_transformers( } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) - - if model.config.model_type == "bloom": - model.transformer._prepare_attn_mask = _prepare_attn_mask - - if model.config.model_type == "llama": - model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask + model = patch_decoder_attention_mask(model) traced_model = jit_trace(model, task, use_cache) save_dir = TemporaryDirectory() diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index eb1ed88467..67e8d20502 100644 --- a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -43,17 +43,12 @@ logger = logging.getLogger(__name__) -# workaround to enable compatibility between openvino models and transformers pipelines -class PreTrainedModel(OptimizedModel): - pass - - @add_start_docstrings( """ Base OVModel class. 
""", ) -class OVBaseModel(PreTrainedModel): +class OVBaseModel(OptimizedModel): auto_model_class = None export_feature = None @@ -86,6 +81,12 @@ def __init__( input_names[next((name for name in names if "/" not in name), names[0])] = idx self.input_names = input_names + output_names = {} + for idx, key in enumerate(model.outputs): + names = tuple(key.get_names()) + output_names[next((name for name in names if "/" not in name), names[0])] = idx + self.output_names = output_names + self.model = model self.request = None if enable_compilation: @@ -302,7 +303,7 @@ def _from_transformers( @classmethod def _to_load( cls, - model: PreTrainedModel, + model, config: PretrainedConfig, onnx_config: OnnxConfig, use_auth_token: Optional[Union[bool, str]] = None, diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 4d87b7eec2..0fa21e3a4a 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -129,7 +129,6 @@ def __init__( self.main_input_name = "input_ids" self.num_pkv = 2 self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) - self.output_names = {key.get_any_name(): idx for idx, key in enumerate(self.model.outputs)} self.key_value_input_names = [key for key in self.input_names if "key_values" in key] self.key_value_output_names = [key for key in self.output_names if "present" in key] self._original_model = self.model.clone() # keep original model for serialization @@ -313,6 +312,7 @@ def forward( input_ids: torch.LongTensor, attention_mask: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> CausalLMOutputWithPast: self.compile() @@ -362,14 +362,28 @@ def forward( inputs["input_ids"] = np.array(input_ids) # Add the attention_mask inputs when needed - if "attention_mask" in self.input_names: + if "attention_mask" in self.input_names or "position_ids" in self.input_names: if attention_mask is not None: - inputs["attention_mask"] = np.array(attention_mask) + attention_mask = np.array(attention_mask) else: - inputs["attention_mask"] = np.ones( + attention_mask = np.ones( (input_ids.shape[0], input_ids.shape[1] + past_len), dtype=inputs["input_ids"].dtype ) + if "attention_mask" in self.input_names: + inputs["attention_mask"] = attention_mask + + if "position_ids" in self.input_names: + if position_ids is not None: + position_ids = np.array(position_ids) + else: + position_ids = np.cumsum(attention_mask, axis=1) - 1 + position_ids[attention_mask == 0] = 1 + if past_key_values: + position_ids = np.expand_dims(position_ids[:, -1], axis=-1) + + inputs["position_ids"] = position_ids + # Run inference self.request.start_async(inputs, shared_memory=True) self.request.wait() diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py index bcc7c2908b..b94f61214d 100644 --- a/optimum/intel/openvino/quantization.py +++ b/optimum/intel/openvino/quantization.py @@ -39,7 +39,6 @@ from ...exporters.openvino import export, export_pytorch_via_onnx from ..utils.constant import _TASK_ALIASES -from ..utils.modeling_utils import patch_decoder_attention_mask from .configuration import OVConfig from .modeling_base import OVBaseModel from .modeling_decoder import OVBaseDecoderModel @@ -394,9 +393,10 @@ def _quantize_torchmodel( task = self.task model = self.model self.model.config.save_pretrained(save_directory) - model = 
patch_decoder_attention_mask(model) - if task == "text-generation": - onnx_config = onnx_config_class(model.config, use_past=model.config.use_cache) + if task.startswith("text-generation"): + onnx_config = onnx_config_class( + model.config, use_past=model.config.use_cache, use_past_in_inputs=model.config.use_cache + ) else: onnx_config = onnx_config_class(model.config) diff --git a/optimum/intel/utils/modeling_utils.py b/optimum/intel/utils/modeling_utils.py index b56e5e4f2d..1a3b6fbede 100644 --- a/optimum/intel/utils/modeling_utils.py +++ b/optimum/intel/utils/modeling_utils.py @@ -18,9 +18,6 @@ from transformers.modeling_utils import PreTrainedModel -# from ...utils.modeling_utils import _prepare_decoder_sliding_window_attention_mask - - MULTI_QUERY_ATTN_MODELS = {"falcon", "gpt_bigcode"} @@ -98,6 +95,40 @@ def _prepare_decoder_attention_mask(attention_mask, input_shape, inputs_embeds, return combined_attention_mask +# Modified from transformers.models.mistral.modeling_mistral._prepare_decoder_sliding_window_attention_mask +def _prepare_decoder_sliding_window_attention_mask( + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: int, +): + from transformers.models.mistral.modeling_mistral import _expand_mask, _make_sliding_window_causal_mask + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + + combined_attention_mask = _make_sliding_window_causal_mask( + input_shape, + device=inputs_embeds.device, + dtype=inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + sliding_window=sliding_window, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def patch_decoder_attention_mask(model: "PreTrainedModel"): """ Apply patch on decoder with past model forward to resolve first inference based on model architecture @@ -112,8 +143,8 @@ def patch_decoder_attention_mask(model: "PreTrainedModel"): model.transformer._prepare_attn_mask = _prepare_attn_mask elif model.config.model_type == "llama": model.model._prepare_decoder_attention_mask = _prepare_decoder_attention_mask - # elif model.config.model_type == "mistral": - # model.model._prepare_decoder_attention_mask = _prepare_decoder_sliding_window_attention_mask + elif model.config.model_type == "mistral": + model.model._prepare_decoder_attention_mask = _prepare_decoder_sliding_window_attention_mask elif model.config.model_type in {"blenderbot-small", "blenderbot", "opt", "pegasus", "bart"}: model.model.decoder._prepare_decoder_attention_mask = _prepare_decoder_attention_mask return model diff --git a/setup.py b/setup.py index 0c1feace30..7949f6d11d 100644 --- a/setup.py +++ b/setup.py @@ -12,8 +12,8 @@ assert False, "Error: Could not open '%s' due %s\n" % (filepath, error) INSTALL_REQUIRE = [ - "optimum>=1.13.0", - "transformers", + "optimum>=1.14.0", + "transformers>=4.20.0", "datasets>=1.4.0", "sentencepiece", "scipy", diff --git a/tests/generation/test_modeling.py b/tests/generation/test_modeling.py index 0fd668ad8f..db36b924f4 100644 --- a/tests/generation/test_modeling.py +++ b/tests/generation/test_modeling.py @@ -20,6 +20,7 @@ 
from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, pipeline, set_seed +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel.generation.modeling import TSModelForCausalLM @@ -28,6 +29,9 @@ "gptj": "hf-internal-testing/tiny-random-gptj", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", + "mistral": "echarlaix/tiny-random-mistral", + "llama": "fxmarty/tiny-llama-fast-tokenizer", + "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", } SEED = 42 @@ -48,7 +52,11 @@ class ModelingIntegrationTest(unittest.TestCase): "gpt2", "gptj", "gpt_neo", + "mistral", + "llama", + # "gpt_bigcode", ) + GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -61,7 +69,12 @@ def test_compare_to_transformers(self, model_arch): trfs_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer("This is a sample", return_tensors="pt") - outputs = model(**tokens) + + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + outputs = model(**tokens, position_ids=position_ids) self.assertIsInstance(outputs.logits, torch.Tensor) with torch.no_grad(): trfs_outputs = trfs_model(**tokens) @@ -71,7 +84,8 @@ def test_compare_to_transformers(self, model_arch): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) loaded_model = TSModelForCausalLM.from_pretrained(tmpdirname) - loaded_model_outputs = loaded_model(**tokens) + loaded_model_outputs = loaded_model(**tokens, position_ids=position_ids) + self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -120,7 +134,6 @@ def test_compare_with_and_without_past_key_values(self): model_id = MODEL_NAMES["gpt2"] tokenizer = AutoTokenizer.from_pretrained(model_id) tokens = tokenizer("This is a sample input", return_tensors="pt") - model_with_pkv = TSModelForCausalLM.from_pretrained(model_id, export=True, use_cache=True) # Warmup _ = model_with_pkv.generate(**tokens) @@ -136,6 +149,9 @@ def test_compare_with_and_without_past_key_values(self): outputs_model_without_pkv = model_without_pkv.generate( **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 ) + self.assertTrue(model_with_pkv.use_cache) + self.assertFalse(model_without_pkv.use_cache) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index f3978b2965..c29e8c2eef 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -51,6 +51,7 @@ from transformers.onnx.utils import get_preprocessor from utils_tests import MODEL_NAMES +from optimum.exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS from optimum.intel import ( OVModelForAudioClassification, OVModelForAudioFrameClassification, @@ -496,7 +497,12 @@ def test_compare_to_transformers(self, model_arch): tokens = tokenizer( "This is a sample", return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None ) - ov_outputs = 
ov_model(**tokens) + position_ids = None + if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: + input_shape = tokens["input_ids"].shape + position_ids = torch.arange(0, input_shape[-1], dtype=torch.long).unsqueeze(0).view(-1, input_shape[-1]) + ov_outputs = ov_model(**tokens, position_ids=position_ids) + self.assertTrue("logits" in ov_outputs) self.assertIsInstance(ov_outputs.logits, torch.Tensor) with torch.no_grad(): diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index c1ec95ea9b..3154ae1133 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -63,7 +63,7 @@ class OVQuantizerTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 32, 35), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 22), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 41, 23), ) @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) @@ -145,7 +145,7 @@ class OVWeightCompressionTest(unittest.TestCase): # TODO : add models SUPPORTED_ARCHITECTURES_WITH_EXPECTED_COMPRESSED_MATMULS = ( (OVModelForSequenceClassification, "hf-internal-testing/tiny-random-bert", 70, 35), - (OVModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 45, 22), + (OVModelForCausalLM, "hf-internal-testing/tiny-random-BartForCausalLM", 27, 14), ) SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION = ( diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 2fa77052eb..8d89d24e18 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -24,7 +24,7 @@ "bart": "hf-internal-testing/tiny-random-bart", "bigbird_pegasus": "hf-internal-testing/tiny-random-bigbird_pegasus", "blenderbot-small": "hf-internal-testing/tiny-random-BlenderbotModel", - "blenderbot": "hf-internal-testing/tiny-random-blenderbot", + "blenderbot": "hf-internal-testing/tiny-random-BlenderbotModel", "bloom": "hf-internal-testing/tiny-random-BloomModel", "camembert": "hf-internal-testing/tiny-random-camembert", "convbert": "hf-internal-testing/tiny-random-ConvBertForSequenceClassification", @@ -102,7 +102,7 @@ "albert": (42,), "vit": (31,), "blenderbot": (35,), - "gpt2": (22,), + "gpt2": (23,), "wav2vec2": (15,), "distilbert": (33,), "t5": (32, 52, 42),