From bf88c927fcdced8c725fb875ba9813c724b60b5b Mon Sep 17 00:00:00 2001 From: "Feng, Jiqing" Date: Tue, 9 Jan 2024 18:14:34 -0800 Subject: [PATCH 1/8] add IPEX model and README update ipex modeling and add case for text-generation and text-classification Signed-off-by: Wang, Yi A --- README.md | 25 +- .../text-classification/run_classification.py | 11 + .../ipex/text-generation/run_generation.py | 12 + optimum/intel/generation/modeling.py | 2 +- optimum/intel/ipex/__init__.py | 3 + optimum/intel/ipex/modeling_base.py | 228 ++++++++++++++++++ optimum/intel/ipex/modeling_decoder.py | 41 ++++ 7 files changed, 320 insertions(+), 2 deletions(-) create mode 100644 examples/ipex/text-classification/run_classification.py create mode 100644 examples/ipex/text-generation/run_generation.py create mode 100644 optimum/intel/ipex/modeling_base.py create mode 100644 optimum/intel/ipex/modeling_decoder.py diff --git a/README.md b/README.md index 54d8371b5b..047c411248 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures. +[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion. + Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. 
The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target. [OpenVINO](https://docs.openvino.ai/latest/index.html) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. @@ -17,6 +19,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | | [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | | [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino,nncf]"` | @@ -37,10 +40,30 @@ or to install from source including dependencies: python -m pip install "optimum-intel[extras]"@git+https://github.com/huggingface/optimum-intel.git ``` -where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`. 
+where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `nncf`. # Quick tour +## Intel Extension for PyTorch +To load a model and run generation with IPEX graph mode, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. +```diff +import torch +from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM + + +model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) + +print(text_generator("This is an example input")) +``` + +For now, we only support text-generation tasks. + ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: diff --git a/examples/ipex/text-classification/run_classification.py b/examples/ipex/text-classification/run_classification.py new file mode 100644 index 0000000000..e0f6255fe0 --- /dev/null +++ b/examples/ipex/text-classification/run_classification.py @@ -0,0 +1,11 @@ +import torch +from transformers import AutoTokenizer, pipeline + +from optimum.intel.ipex.modeling_base import IPEXModelForSequenceClassification + + +model_id = "distilbert-base-uncased-finetuned-sst-2-english" +model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_classifer = pipeline("text-classification", model=model, tokenizer=tokenizer) +print(text_classifer("This movie is disgustingly good !")) diff --git a/examples/ipex/text-generation/run_generation.py b/examples/ipex/text-generation/run_generation.py new file mode 100644 index 0000000000..fe57e37cb1 --- /dev/null +++ 
b/examples/ipex/text-generation/run_generation.py @@ -0,0 +1,12 @@ +import torch +from transformers import AutoTokenizer, pipeline + +from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM + + +model_id = "gpt2" +model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) + +print(text_generator("This is an example input")) diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py index 07cb8bc98c..b2d091f87b 100644 --- a/optimum/intel/generation/modeling.py +++ b/optimum/intel/generation/modeling.py @@ -102,6 +102,7 @@ def __init__( self.model_save_dir = model_save_dir self.preprocessors = kwargs.get("preprocessors", []) self.use_cache = use_cache + ## TO do: add XPU support self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) self.model_dtype = kwargs.get("model_dtype", None) @@ -282,7 +283,6 @@ def forward( inputs["position_ids"] = position_ids model_type = self.config.model_type.replace("_", "-") - if self.use_cache: if past_key_values is None: nb_pkv = 2 diff --git a/optimum/intel/ipex/__init__.py b/optimum/intel/ipex/__init__.py index bb1d9c270b..710063b980 100644 --- a/optimum/intel/ipex/__init__.py +++ b/optimum/intel/ipex/__init__.py @@ -1 +1,4 @@ from .inference import inference_mode + + +generation_tasks = ("text-generation",) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py new file mode 100644 index 0000000000..6bfb474069 --- /dev/null +++ b/optimum/intel/ipex/modeling_base.py @@ -0,0 +1,228 @@ +import logging +import os +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Union + +import intel_extension_for_pytorch as ipex +import 
torch +from huggingface_hub import hf_hub_download +from transformers import ( + AutoConfig, + AutoModel, + AutoModelForSequenceClassification, + GenerationMixin, + PretrainedConfig, +) +from transformers.models.auto.auto_factory import _get_model_class +from transformers.utils import WEIGHTS_NAME + +from optimum.exporters import TasksManager +from optimum.modeling_base import OptimizedModel + +from ..utils.import_utils import is_torch_version +from ..utils.modeling_utils import patch_decoder_attention_mask +from . import generation_tasks + + +SUPPORT_MODEL_LIST_FOR_CAUSAL_LM = { + # "llama": LlamaForCausalLM +} + +SUPPORT_TASK_LIST = {"text-generation": SUPPORT_MODEL_LIST_FOR_CAUSAL_LM} +from ..generation.modeling import jit_trace + + +logger = logging.getLogger(__name__) + + +class IPEXModel(OptimizedModel): + auto_model_class = AutoModel + export_feature = "feature-extraction" + base_model_prefix = "ipex_model" + + def __init__( + self, + model, + config: PretrainedConfig = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + use_cache: bool = True, + **kwargs, + ): + OptimizedModel.__init__(self, model=model, config=config) + # To do: add XPU support + self._device = torch.device("cpu") + self.model.to(self._device) + + # Registers the IPEXModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating + # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 + AutoConfig.register(self.base_model_prefix, AutoConfig) + if hasattr(self.auto_model_class, "register"): + self.auto_model_class.register(AutoConfig, self.__class__) + + @classmethod + def _from_transformers( + cls, + model_id: str, + config: PretrainedConfig, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + subfolder: str = "", + local_files_only: bool = 
False, + use_cache: bool = True, + torch_dtype: Optional[Union[str, "torch.dtype"]] = None, + **kwargs, + ): + if is_torch_version("<", "2.1.0"): + raise ImportError("`torch>=2.1.0` is needed to trace your model") + task = cls.export_feature + model_kwargs = { + "revision": revision, + "use_auth_token": use_auth_token, + "cache_dir": cache_dir, + "subfolder": subfolder, + "local_files_only": local_files_only, + "force_download": force_download, + "use_cache": use_cache, + "torch_dtype": torch_dtype, + "device": "cpu", + } + if task not in generation_tasks: + model_kwargs.pop("use_cache") + model_type = None + support_ipex_transformers = False + if task in SUPPORT_TASK_LIST.keys(): + for name in SUPPORT_TASK_LIST[task].keys(): + if name in model_id: + support_ipex_transformers = True + model_type = name + break + + if support_ipex_transformers and task in SUPPORT_TASK_LIST and model_type in SUPPORT_TASK_LIST[task]: + # model = SUPPORT_TASK_LIST[task][model_type].from_pretrained(model_id, **model_kwargs) + pass + else: + model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) + model = patch_decoder_attention_mask(model) + + model = ipex.optimize(model, dtype=torch_dtype, level="O1", auto_kernel_selection=True) + + if kwargs.pop("jit", True): + try: + traced_model = cls.apply_jit_optimize(model, task, use_cache, support_ipex_transformers) + save_dir = TemporaryDirectory() + save_dir_path = Path(save_dir.name) + torch.jit.save(traced_model, save_dir_path / WEIGHTS_NAME) + config.torchscript = True + + return cls._from_pretrained( + model_id=save_dir_path, + config=config, + use_cache=use_cache, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + cache_dir=cache_dir, + local_files_only=local_files_only, + model_dtype=torch_dtype, + **kwargs, + ) + except Exception as e: + logger.warning(f"failed to use PyTorch jit mode due to: {e}.") + + return cls( + model, + config=config, + use_cache=use_cache, + 
model_dtype=torch_dtype, + **kwargs, + ) + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: PretrainedConfig, + use_auth_token: Optional[Union[bool, str, None]] = None, + revision: Optional[Union[str, None]] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + file_name: Optional[str] = WEIGHTS_NAME, + local_files_only: bool = False, + use_cache: bool = True, + **kwargs, + ): + # Load the model from local directory + if os.path.isdir(model_id): + model_cache_path = os.path.join(model_id, file_name) + model_save_dir = model_id + # Download the model from the hub + else: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=file_name, + use_auth_token=use_auth_token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + model_save_dir = Path(model_cache_path).parent + + if getattr(config, "torchscript", False): + model = torch.jit.load(model_cache_path) + torch.jit.freeze(model.eval()) + else: + model_class = _get_model_class(config, cls.auto_model_class._model_mapping) + model = model_class.from_pretrained(model_save_dir) + + return cls( + model, + config=config, + model_save_dir=model_save_dir, + use_cache=use_cache, + **kwargs, + ) + + def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs): + if getattr(self.config, "torchscript", False): + torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + else: + torch.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def eval(self): + self.model.eval() + return self + + @property + def device(self) -> torch.device: + return self._device + + def to(self, device: Union[torch.device, str]): + self._device = device if isinstance(device, torch.device) else torch.device(device) + self.model.to(self._device) + return self + + def 
can_generate(self): + return isinstance(self.model, GenerationMixin) + + def generate(self, *args, **kwargs): + if not self.can_generate(): + raise TypeError( + f"The current model class {self.model.__class__} is not compatible with `.generate()`, as it doesn't have a language model head." + ) + return self.model.generate(*args, **kwargs) + + @classmethod + def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers=False): + return jit_trace(model, task, use_cache) + + +class IPEXModelForSequenceClassification(IPEXModel): + auto_model_class = AutoModelForSequenceClassification + export_feature = "text-classification" diff --git a/optimum/intel/ipex/modeling_decoder.py b/optimum/intel/ipex/modeling_decoder.py new file mode 100644 index 0000000000..9835e771b9 --- /dev/null +++ b/optimum/intel/ipex/modeling_decoder.py @@ -0,0 +1,41 @@ +import logging +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Union + +from transformers import AutoModelForCausalLM, PretrainedConfig + +from ..generation.modeling import BaseModelForCausalLM, jit_trace +from .modeling_base import IPEXModel + + +logger = logging.getLogger(__name__) + + +class IPEXModelForCausalLM(IPEXModel, BaseModelForCausalLM): + auto_model_class = AutoModelForCausalLM + export_feature = "text-generation" + forward = BaseModelForCausalLM.forward + generate = BaseModelForCausalLM.generate + can_generate = BaseModelForCausalLM.can_generate + + def __init__( + self, + model, + config: PretrainedConfig = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + use_cache: bool = True, + **kwargs, + ): + IPEXModel.__init__(self, model, config) + BaseModelForCausalLM.__init__(self, model, config, model_save_dir, use_cache, **kwargs) + + @classmethod + def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers): + if not support_ipex_transformers: + return jit_trace(model, task, use_cache) + else: + # from 
intel_extension_for_pytorch.transformers.optimize import get_dummy_input + # dummy_jit_inputs = get_dummy_input(task, model) # From ipex + # model = torch.jit.trace(model, example_input_kwargs=dummy_jit_inputs) + return model From 91cee3bf6f676d78dfc107606b4e562c56c7f1f6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 12 Jan 2024 03:59:17 -0500 Subject: [PATCH 2/8] fix style --- optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 6bfb474069..cbafe13595 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -20,6 +20,7 @@ from optimum.exporters import TasksManager from optimum.modeling_base import OptimizedModel +from ..generation.modeling import jit_trace from ..utils.import_utils import is_torch_version from ..utils.modeling_utils import patch_decoder_attention_mask from . import generation_tasks @@ -30,7 +31,6 @@ } SUPPORT_TASK_LIST = {"text-generation": SUPPORT_MODEL_LIST_FOR_CAUSAL_LM} -from ..generation.modeling import jit_trace logger = logging.getLogger(__name__) From b2564066c7dd935029fefa108847abc07b275901 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 11:45:03 +0100 Subject: [PATCH 3/8] IPEX refactorization --- README.md | 44 ++++++++-------- docs/source/reference_inc.mdx | 4 +- .../text-classification/run_classification.py | 11 ---- .../ipex/text-generation/run_generation.py | 12 ----- optimum/intel/__init__.py | 25 ++++++++-- optimum/intel/ipex/__init__.py | 10 ++-- optimum/intel/ipex/modeling_base.py | 46 ++++++++++++++++- optimum/intel/ipex/modeling_decoder.py | 41 --------------- optimum/intel/ipex/utils.py | 1 + optimum/intel/neural_compressor/__init__.py | 2 +- .../intel/neural_compressor/modeling_base.py | 39 ++++++--------- .../neural_compressor/modeling_decoder.py | 50 ------------------- optimum/intel/utils/dummy_ipex_objects.py | 44 ++++++++++++++++ 
tests/neural_compressor/test_optimization.py | 2 + tests/neural_compressor/utils_tests.py | 16 +++++- 15 files changed, 173 insertions(+), 174 deletions(-) delete mode 100644 examples/ipex/text-classification/run_classification.py delete mode 100644 examples/ipex/text-generation/run_generation.py delete mode 100644 optimum/intel/ipex/modeling_decoder.py create mode 100644 optimum/intel/ipex/utils.py diff --git a/README.md b/README.md index 047c411248..899c6793e7 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| -| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | | [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | | [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino,nncf]"` | +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -44,26 +44,6 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour -## Intel Extension for PyTorch -To load a model and run generation with IPEX graph mode, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. 
-```diff -import torch -from transformers import AutoTokenizer, pipeline -- from transformers import AutoModelForCausalLM -+ from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM - - -model_id = "gpt2" -- model = AutoModelForCausalLM.from_pretrained(model_id) -+ model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) -tokenizer = AutoTokenizer.from_pretrained(model_id) -text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) - -print(text_generator("This is an example input")) -``` - -For now, we only support text-generation tasks. - ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: @@ -222,6 +202,28 @@ Quantization aware training (QAT) is applied in order to simulate the effects of You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/index). +## Intel Extension for PyTorch + +To load a model and run generation with IPEX graph mode, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. + +```diff +import torch +from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex import IPEXModelForCausalLM + + +model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) +results = text_generator("This is an example input") +``` + +For now, we only support text-generation tasks. + + ## Running the examples Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. 
diff --git a/docs/source/reference_inc.mdx b/docs/source/reference_inc.mdx index fcc017c89a..e1a459c094 100644 --- a/docs/source/reference_inc.mdx +++ b/docs/source/reference_inc.mdx @@ -43,8 +43,8 @@ specific language governing permissions and limitations under the License. ## INCModelForCausalLM -[[autodoc]] neural_compressor.modeling_decoder.INCModelForCausalLM +[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM ## INCModelForSeq2SeqLM -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM diff --git a/examples/ipex/text-classification/run_classification.py b/examples/ipex/text-classification/run_classification.py deleted file mode 100644 index e0f6255fe0..0000000000 --- a/examples/ipex/text-classification/run_classification.py +++ /dev/null @@ -1,11 +0,0 @@ -import torch -from transformers import AutoTokenizer, pipeline - -from optimum.intel.ipex.modeling_base import IPEXModelForSequenceClassification - - -model_id = "distilbert-base-uncased-finetuned-sst-2-english" -model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) -tokenizer = AutoTokenizer.from_pretrained(model_id) -text_classifer = pipeline("text-classification", model=model, tokenizer=tokenizer) -print(text_classifer("This movie is disgustingly good !")) diff --git a/examples/ipex/text-generation/run_generation.py b/examples/ipex/text-generation/run_generation.py deleted file mode 100644 index fe57e37cb1..0000000000 --- a/examples/ipex/text-generation/run_generation.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch -from transformers import AutoTokenizer, pipeline - -from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM - - -model_id = "gpt2" -model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) -tokenizer = AutoTokenizer.from_pretrained(model_id) -text_generator = 
pipeline("text-generation", model=model, tokenizer=tokenizer) - -print(text_generator("This is an example input")) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 570a451bd8..bb78afc7a3 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -35,9 +35,20 @@ if not is_ipex_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - _import_structure["utils.dummy_ipex_objects"] = ["inference_mode"] + from .utils import dummy_ipex_objects + + _import_structure["utils.dummy_ipex_objects"] = [ + name for name in dir(dummy_ipex_objects) if not name.startswith("_") + ] else: - _import_structure["ipex"] = ["inference_mode"] + _import_structure["ipex"] = [ + "inference_mode", + "IPEXModelForCausalLM", + "IPEXModelForSequenceClassification", + "IPEXModelForMaskedLM", + "IPEXModelForTokenClassification", + ] + try: if not (is_openvino_available() and is_nncf_available()): @@ -144,9 +155,15 @@ if not is_ipex_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_ipex_objects import inference_mode + from .utils.dummy_ipex_objects import * else: - from .ipex import inference_mode + from .ipex import ( + IPEXModelForCausalLM, + IPEXModelForMaskedLM, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, + inference_mode, + ) try: if not (is_openvino_available() and is_nncf_available()): diff --git a/optimum/intel/ipex/__init__.py b/optimum/intel/ipex/__init__.py index 710063b980..5cd0945ac2 100644 --- a/optimum/intel/ipex/__init__.py +++ b/optimum/intel/ipex/__init__.py @@ -1,4 +1,8 @@ -from .inference import inference_mode - +from optimum.intel.ipex.modeling_base import ( + IPEXModelForCausalLM, + IPEXModelForMaskedLM, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, +) -generation_tasks = ("text-generation",) +from .inference import inference_mode diff --git a/optimum/intel/ipex/modeling_base.py 
b/optimum/intel/ipex/modeling_base.py index cbafe13595..4e8198ef42 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -10,7 +10,10 @@ from transformers import ( AutoConfig, AutoModel, + AutoModelForCausalLM, + AutoModelForMaskedLM, AutoModelForSequenceClassification, + AutoModelForTokenClassification, GenerationMixin, PretrainedConfig, ) @@ -20,10 +23,10 @@ from optimum.exporters import TasksManager from optimum.modeling_base import OptimizedModel -from ..generation.modeling import jit_trace +from ..generation.modeling import BaseModelForCausalLM, jit_trace from ..utils.import_utils import is_torch_version from ..utils.modeling_utils import patch_decoder_attention_mask -from . import generation_tasks +from .utils import generation_tasks SUPPORT_MODEL_LIST_FOR_CAUSAL_LM = { @@ -226,3 +229,42 @@ def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers=Fa class IPEXModelForSequenceClassification(IPEXModel): auto_model_class = AutoModelForSequenceClassification export_feature = "text-classification" + + +class IPEXModelForMaskedLM(IPEXModel): + auto_model_class = AutoModelForMaskedLM + export_feature = "fill-mask" + + +class IPEXModelForTokenClassification(IPEXModel): + auto_model_class = AutoModelForTokenClassification + export_feature = "token-classification" + + +class IPEXModelForCausalLM(IPEXModel, BaseModelForCausalLM): + auto_model_class = AutoModelForCausalLM + export_feature = "text-generation" + forward = BaseModelForCausalLM.forward + generate = BaseModelForCausalLM.generate + can_generate = BaseModelForCausalLM.can_generate + + def __init__( + self, + model, + config: PretrainedConfig = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + use_cache: bool = True, + **kwargs, + ): + IPEXModel.__init__(self, model, config) + BaseModelForCausalLM.__init__(self, model, config, model_save_dir, use_cache, **kwargs) + + @classmethod + def apply_jit_optimize(cls, model, task, 
use_cache, support_ipex_transformers): + if not support_ipex_transformers: + return jit_trace(model, task, use_cache) + else: + # from intel_extension_for_pytorch.transformers.optimize import get_dummy_input + # dummy_jit_inputs = get_dummy_input(task, model) # From ipex + # model = torch.jit.trace(model, example_input_kwargs=dummy_jit_inputs) + return model diff --git a/optimum/intel/ipex/modeling_decoder.py b/optimum/intel/ipex/modeling_decoder.py deleted file mode 100644 index 9835e771b9..0000000000 --- a/optimum/intel/ipex/modeling_decoder.py +++ /dev/null @@ -1,41 +0,0 @@ -import logging -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional, Union - -from transformers import AutoModelForCausalLM, PretrainedConfig - -from ..generation.modeling import BaseModelForCausalLM, jit_trace -from .modeling_base import IPEXModel - - -logger = logging.getLogger(__name__) - - -class IPEXModelForCausalLM(IPEXModel, BaseModelForCausalLM): - auto_model_class = AutoModelForCausalLM - export_feature = "text-generation" - forward = BaseModelForCausalLM.forward - generate = BaseModelForCausalLM.generate - can_generate = BaseModelForCausalLM.can_generate - - def __init__( - self, - model, - config: PretrainedConfig = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - use_cache: bool = True, - **kwargs, - ): - IPEXModel.__init__(self, model, config) - BaseModelForCausalLM.__init__(self, model, config, model_save_dir, use_cache, **kwargs) - - @classmethod - def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers): - if not support_ipex_transformers: - return jit_trace(model, task, use_cache) - else: - # from intel_extension_for_pytorch.transformers.optimize import get_dummy_input - # dummy_jit_inputs = get_dummy_input(task, model) # From ipex - # model = torch.jit.trace(model, example_input_kwargs=dummy_jit_inputs) - return model diff --git a/optimum/intel/ipex/utils.py 
b/optimum/intel/ipex/utils.py new file mode 100644 index 0000000000..be0af76407 --- /dev/null +++ b/optimum/intel/ipex/utils.py @@ -0,0 +1 @@ +generation_tasks = ("text-generation",) diff --git a/optimum/intel/neural_compressor/__init__.py b/optimum/intel/neural_compressor/__init__.py index cb5621a333..a7170120b7 100644 --- a/optimum/intel/neural_compressor/__init__.py +++ b/optimum/intel/neural_compressor/__init__.py @@ -16,6 +16,7 @@ from .configuration import INCConfig from .modeling_base import ( INCModel, + INCModelForCausalLM, INCModelForMaskedLM, INCModelForMultipleChoice, INCModelForQuestionAnswering, @@ -24,7 +25,6 @@ INCModelForTokenClassification, INCModelForVision2Seq, ) -from .modeling_decoder import INCModelForCausalLM from .quantization import INCQuantizationMode, INCQuantizer from .trainer import INCTrainer from .trainer_seq2seq import INCSeq2SeqTrainer diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 5cd224146a..6fca80e6f0 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -24,6 +24,7 @@ from transformers import ( AutoConfig, AutoModel, + AutoModelForCausalLM, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -37,7 +38,6 @@ ) from transformers.modeling_utils import no_init_weights from transformers.models.auto.auto_factory import _get_model_class -from transformers.utils import is_ipex_available from transformers.utils.generic import ContextManagers from ...modeling_base import OptimizedModel @@ -84,18 +84,12 @@ def __init__( ) if getattr(self.config, "backend", None) == "ipex": - if not is_ipex_available(): - raise ImportError( - "Intel PyTorch Extensions was not found, please make sure you've installed the package or run `pip install intel-extension-for-pytorch`" - ) - # Need import intel_extension_for_pytorch for ipex model - import intel_extension_for_pytorch as ipex - - # 
Just to avoid to change by ruff. - logger.info("intel_extension_for_pytorch version is " + ipex.__version__) + raise NotImplementedError( + "`INCModel` does not support the loading of models resulting from IPEX, please use `IPEXModel` to load your model instead" + ) # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating - # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 + # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 AutoConfig.register(self.base_model_prefix, AutoConfig) if hasattr(self.auto_model_class, "register"): self.auto_model_class.register(AutoConfig, self.__class__) @@ -149,13 +143,6 @@ def _from_pretrained( f"Please check if torch quantization the model was obtained with is compatible with {_torch_version}." ) - if getattr(config, "backend", None) == "ipex" or getattr(config, "torchscript", False): - # NOTE: Will improve to use load function when Intel Neural Compressor next 2.1 release. 
- # load(model_cache_path) - model = torch.jit.load(model_cache_path) - model = torch.jit.freeze(model.eval()) - return cls(model, config=config, model_save_dir=model_save_dir, inc_config=inc_config, **kwargs) - model_class = _get_model_class(config, cls.auto_model_class._model_mapping) # Load the state dictionary of the model to verify whether the model to get the quantization config state_dict = torch.load(model_cache_path, map_location="cpu") @@ -182,13 +169,10 @@ def _from_pretrained( def _save_pretrained(self, save_directory: Union[str, Path]): output_path = os.path.join(save_directory, WEIGHTS_NAME) - if isinstance(self.model, torch.nn.Module): - state_dict = self.model.state_dict() - if self._q_config: - state_dict["best_configure"] = self._q_config - torch.save(state_dict, output_path) - else: - torch.jit.save(self.model, output_path) + state_dict = self.model.state_dict() + if self._q_config: + state_dict["best_configure"] = self._q_config + torch.save(state_dict, output_path) if self.inc_config: self.inc_config.save_pretrained(save_directory) @@ -258,3 +242,8 @@ class INCModelForVision2Seq(INCModel): class INCModelForXLNetLM(INCModel): auto_model_class = XLNetLMHeadModel export_feature = "fill-mask" + + +class INCModelForCausalLM(INCModel): + auto_model_class = AutoModelForCausalLM + export_feature = "text-generation" diff --git a/optimum/intel/neural_compressor/modeling_decoder.py b/optimum/intel/neural_compressor/modeling_decoder.py index e284ce4c3e..4cb767a102 100644 --- a/optimum/intel/neural_compressor/modeling_decoder.py +++ b/optimum/intel/neural_compressor/modeling_decoder.py @@ -11,53 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import logging -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Dict, Optional, Union - -from transformers import AutoModelForCausalLM, PretrainedConfig -from transformers.file_utils import add_start_docstrings - -from optimum.intel.generation import BaseModelForCausalLM - -from .modeling_base import MODEL_START_DOCSTRING, INCModel - - -logger = logging.getLogger(__name__) - - -@add_start_docstrings( - """ - Neural-compressor Model with a causal language modeling head on top (linear layer with weights tied to the input - embeddings). - """, - MODEL_START_DOCSTRING, -) -class INCModelForCausalLM(INCModel, BaseModelForCausalLM): - auto_model_class = AutoModelForCausalLM - export_feature = "text-generation" - forward = BaseModelForCausalLM.forward - generate = BaseModelForCausalLM.generate - can_generate = BaseModelForCausalLM.can_generate - - def __init__( - self, - model, - config: PretrainedConfig = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - q_config: Dict = None, - inc_config: Dict = None, - use_cache: bool = True, - **kwargs, - ): - super(INCModelForCausalLM, self).__init__( - model=model, - config=config, - model_save_dir=model_save_dir, - q_config=q_config, - inc_config=inc_config, - use_cache=use_cache, - **kwargs, - ) diff --git a/optimum/intel/utils/dummy_ipex_objects.py b/optimum/intel/utils/dummy_ipex_objects.py index d4962e75a2..5e36db6acd 100644 --- a/optimum/intel/utils/dummy_ipex_objects.py +++ b/optimum/intel/utils/dummy_ipex_objects.py @@ -20,3 +20,47 @@ class inference_mode(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["ipex"]) + + +class IPEXModelForSequenceClassification(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) + + +class 
IPEXModelForTokenClassification(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) + + +class IPEXModelForMaskedLM(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) + + +class IPEXModelForCausalLM(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index f28c720138..3a7717d17a 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -190,6 +190,8 @@ def test_ipex_static_quantization_with_smoothquant(self, task, model_name, expec is_static=True, load_onnx_model=False, num_samples=num_samples, + load_inc_model=False, + load_ipex_model=True, ) def test_weight_only_quantization(self): diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py index 0a9cc0b664..a429ce6dd1 100644 --- a/tests/neural_compressor/utils_tests.py +++ b/tests/neural_compressor/utils_tests.py @@ -39,6 +39,14 @@ INCSeq2SeqTrainer, INCStableDiffusionPipeline, ) + +from optimum.intel.ipex import ( + IPEXModelForCausalLM, + IPEXModelForSequenceClassification, + IPEXModelForMaskedLM, + IPEXModelForTokenClassification, +) + from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS from optimum.intel.utils.constant import ONNX_WEIGHTS_NAME from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification @@ -94,6 +102,7 @@ def check_model_outputs( load_inc_model=True, 
num_samples=None, file_name=None, + load_ipex_model=False, ): tokens = tokenizer("This is a sample input", return_tensors="pt") file_name = ONNX_WEIGHTS_NAME if task != "text-generation" else "decoder_model.onnx" @@ -111,8 +120,11 @@ def check_model_outputs( with torch.no_grad(): model_outputs = q_model(**tokens) outputs = model_outputs["logits"] if isinstance(model_outputs, dict) else model_outputs[0] - if load_inc_model: - inc_model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(save_directory) + auto_class = _HEAD_TO_AUTOMODELS[task] + if load_ipex_model: + auto_class = auto_class.replace("INC", "IPEX") + if load_inc_model or load_ipex_model: + inc_model = eval(auto_class).from_pretrained(save_directory) inc_model_outputs = inc_model(**tokens) self.assertTrue(torch.allclose(inc_model_outputs["logits"], outputs, atol=1e-2)) # self.assertEqual(inc_config.save_onnx_model, load_onnx_model) From 36593e2c6b9ae47092e1f5a0a6be59cb959eb4e9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 12:13:02 +0100 Subject: [PATCH 4/8] typo --- optimum/intel/neural_compressor/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 6fca80e6f0..b74c08a573 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -89,7 +89,7 @@ def __init__( ) # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating - # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863NotImplementedError + # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 AutoConfig.register(self.base_model_prefix, AutoConfig) if hasattr(self.auto_model_class, "register"): 
self.auto_model_class.register(AutoConfig, self.__class__) From 8a98ce9088e060b19612918d89b07139b14d1013 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 12:21:39 +0100 Subject: [PATCH 5/8] remove use cache arg when loading model --- optimum/intel/ipex/modeling_base.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 4e8198ef42..95830366fb 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -26,7 +26,9 @@ from ..generation.modeling import BaseModelForCausalLM, jit_trace from ..utils.import_utils import is_torch_version from ..utils.modeling_utils import patch_decoder_attention_mask -from .utils import generation_tasks + + +# from .utils import generation_tasks SUPPORT_MODEL_LIST_FOR_CAUSAL_LM = { @@ -88,12 +90,9 @@ def _from_transformers( "subfolder": subfolder, "local_files_only": local_files_only, "force_download": force_download, - "use_cache": use_cache, "torch_dtype": torch_dtype, "device": "cpu", } - if task not in generation_tasks: - model_kwargs.pop("use_cache") model_type = None support_ipex_transformers = False if task in SUPPORT_TASK_LIST.keys(): From c8043075a825844b21271b31a3a2d9c5b1dba3cd Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 12:23:51 +0100 Subject: [PATCH 6/8] fix style --- optimum/intel/ipex/modeling_base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 95830366fb..f00def43b7 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -134,13 +134,7 @@ def _from_transformers( except Exception as e: logger.warning(f"failed to use PyTorch jit mode due to: {e}.") - return cls( - model, - config=config, - use_cache=use_cache, - model_dtype=torch_dtype, - **kwargs, - ) + return cls(model, config=config, use_cache=use_cache, 
model_dtype=torch_dtype, **kwargs) @classmethod def _from_pretrained( From 01e7b3fa91753ce201705d1b11c73437ef694c77 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 14:12:38 +0100 Subject: [PATCH 7/8] move tests --- tests/ipex/test_modeling.py | 66 ++++++++++++++++++++++++ tests/neural_compressor/test_modeling.py | 43 --------------- 2 files changed, 66 insertions(+), 43 deletions(-) create mode 100644 tests/ipex/test_modeling.py diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py new file mode 100644 index 0000000000..7af8e0dc60 --- /dev/null +++ b/tests/ipex/test_modeling.py @@ -0,0 +1,66 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import time +import unittest + +import torch +from transformers import AutoTokenizer + +from optimum.intel import IPEXModelForCausalLM + + +class Timer(object): + def __enter__(self): + self.elapsed = time.perf_counter() + return self + + def __exit__(self, type, value, traceback): + self.elapsed = (time.perf_counter() - self.elapsed) * 1e3 + + +class IPEXModelingTest(unittest.TestCase): + GENERATION_LENGTH = 100 + SPEEDUP_CACHE = 1.1 + + def test_compare_with_and_without_past_key_values(self): + model_id = "echarlaix/tiny-random-gpt2-torchscript" + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer("This is a sample input", return_tensors="pt") + + model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True, subfolder="model_with_pkv") + # Warmup + model_with_pkv.generate(**tokens) + with Timer() as with_pkv_timer: + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) + model_without_pkv = IPEXModelForCausalLM.from_pretrained( + model_id, use_cache=False, subfolder="model_without_pkv" + ) + # Warmup + model_without_pkv.generate(**tokens) + with Timer() as without_pkv_timer: + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) + self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) + self.assertTrue( + without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, + f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," + f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", + ) diff --git a/tests/neural_compressor/test_modeling.py 
b/tests/neural_compressor/test_modeling.py index 8098f011c5..e0a41e76af 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -15,11 +15,9 @@ import os import tempfile -import time import unittest import torch -from packaging.version import Version, parse from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed @@ -40,7 +38,6 @@ INCTrainer, ) from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME -from optimum.version import __version__ as _optimum_version os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -68,15 +65,6 @@ DIFFUSERS_MODEL_NAMES_TO_TASK = (("echarlaix/stable-diffusion-v1-5-inc-int8-dynamic", "stable-diffusion"),) -class Timer(object): - def __enter__(self): - self.elapsed = time.perf_counter() - return self - - def __exit__(self, type, value, traceback): - self.elapsed = (time.perf_counter() - self.elapsed) * 1e3 - - class INCModelingTest(unittest.TestCase): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -134,34 +122,3 @@ def test_pipeline(self, model_id, task): inputs *= 2 pipe(*inputs) - - @unittest.skipIf(parse(_optimum_version) < Version("1.14.0"), "not supported, needs optimum>=v1.14.0") - def test_compare_with_and_without_past_key_values(self): - model_id = "echarlaix/tiny-random-gpt2-torchscript" - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokens = tokenizer("This is a sample input", return_tensors="pt") - - model_with_pkv = INCModelForCausalLM.from_pretrained(model_id, use_cache=True, subfolder="model_with_pkv") - # Warmup - model_with_pkv.generate(**tokens) - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - model_without_pkv = INCModelForCausalLM.from_pretrained( - model_id, use_cache=False, subfolder="model_without_pkv" - ) - # Warmup - model_without_pkv.generate(**tokens) - with 
Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) From 11493f27d2e3b166ba5329283f41fe81633cb0f8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 14:24:15 +0100 Subject: [PATCH 8/8] Add fix --- optimum/intel/ipex/modeling_base.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index f00def43b7..d8c0a89ebf 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -82,6 +82,7 @@ def _from_transformers( ): if is_torch_version("<", "2.1.0"): raise ImportError("`torch>=2.0.0` is needed to trace your model") + task = cls.export_feature model_kwargs = { "revision": revision, @@ -147,6 +148,7 @@ def _from_pretrained( cache_dir: Optional[str] = None, file_name: Optional[str] = WEIGHTS_NAME, local_files_only: bool = False, + subfolder: str = "", use_cache: bool = True, **kwargs, ): @@ -164,6 +166,7 @@ def _from_pretrained( cache_dir=cache_dir, force_download=force_download, local_files_only=local_files_only, + subfolder=subfolder, ) model_save_dir = Path(model_cache_path).parent @@ -174,19 +177,16 @@ def _from_pretrained( model_class = _get_model_class(config, cls.auto_model_class._model_mapping) model = model_class.from_pretrained(model_save_dir) - return cls( 
- model, - config=config, - model_save_dir=model_save_dir, - use_cache=use_cache, - **kwargs, - ) - - def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs): - if getattr(self.config, "torchscript", False): - torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + return cls(model, config=config, model_save_dir=model_save_dir, use_cache=use_cache, **kwargs) + + def _save_pretrained(self, save_directory: Union[str, Path]): + output_path = os.path.join(save_directory, WEIGHTS_NAME) + + if isinstance(self.model, torch.nn.Module): + state_dict = self.model.state_dict() + torch.save(state_dict, output_path) else: - torch.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + torch.jit.save(self.model, output_path) def forward(self, *args, **kwargs): return self.model(*args, **kwargs)