From bf88c927fcdced8c725fb875ba9813c724b60b5b Mon Sep 17 00:00:00 2001 From: "Feng, Jiqing" Date: Tue, 9 Jan 2024 18:14:34 -0800 Subject: [PATCH 1/8] add IPEX model and README update ipex modeling and add case for text-generation and text-classification Signed-off-by: Wang, Yi A --- README.md | 25 +- .../text-classification/run_classification.py | 11 + .../ipex/text-generation/run_generation.py | 12 + optimum/intel/generation/modeling.py | 2 +- optimum/intel/ipex/__init__.py | 3 + optimum/intel/ipex/modeling_base.py | 228 ++++++++++++++++++ optimum/intel/ipex/modeling_decoder.py | 41 ++++ 7 files changed, 320 insertions(+), 2 deletions(-) create mode 100644 examples/ipex/text-classification/run_classification.py create mode 100644 examples/ipex/text-generation/run_generation.py create mode 100644 optimum/intel/ipex/modeling_base.py create mode 100644 optimum/intel/ipex/modeling_decoder.py diff --git a/README.md b/README.md index 54d8371b5b..047c411248 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,8 @@ 🤗 Optimum Intel is the interface between the 🤗 Transformers and Diffusers libraries and the different tools and libraries provided by Intel to accelerate end-to-end pipelines on Intel architectures. +[Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) is an open-source library which provides optimizations for both eager mode and graph mode, however, compared to eager mode, graph mode in PyTorch* normally yields better performance from optimization techniques, such as operation fusion. + Intel [Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) is an open-source library enabling the usage of the most popular compression techniques such as quantization, pruning and knowledge distillation. It supports automatic accuracy-driven tuning strategies in order for users to easily generate quantized model. 
The users can easily apply static, dynamic and aware-training quantization approaches while giving an expected accuracy criteria. It also supports different weight pruning techniques enabling the creation of pruned model giving a predefined sparsity target. [OpenVINO](https://docs.openvino.ai/latest/index.html) is an open-source toolkit that enables high performance inference capabilities for Intel CPUs, GPUs, and special DL inference accelerators ([see](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html) the full list of supported devices). It is supplied with a set of tools to optimize your models with compression techniques such as quantization, pruning and knowledge distillation. Optimum Intel provides a simple interface to optimize your Transformers and Diffusers models, convert them to the OpenVINO Intermediate Representation (IR) format and run inference using OpenVINO Runtime. @@ -17,6 +19,7 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | | [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | | [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino,nncf]"` | @@ -37,10 +40,30 @@ or to install from source including dependencies: python -m pip install "optimum-intel[extras]"@git+https://github.com/huggingface/optimum-intel.git ``` -where `extras` can be one or more of `neural-compressor`, `openvino`, `nncf`. 
+where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `nncf`. # Quick tour +## Intel Extension for PyTorch +To load a model and run generation with IPEX graph mode, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. +```diff +import torch +from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM + + +model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) + +print(text_generator("This is an example input")) +``` + +For now, we only support text-generation tasks. + ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: diff --git a/examples/ipex/text-classification/run_classification.py b/examples/ipex/text-classification/run_classification.py new file mode 100644 index 0000000000..e0f6255fe0 --- /dev/null +++ b/examples/ipex/text-classification/run_classification.py @@ -0,0 +1,11 @@ +import torch +from transformers import AutoTokenizer, pipeline + +from optimum.intel.ipex.modeling_base import IPEXModelForSequenceClassification + + +model_id = "distilbert-base-uncased-finetuned-sst-2-english" +model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_classifer = pipeline("text-classification", model=model, tokenizer=tokenizer) +print(text_classifer("This movie is disgustingly good !")) diff --git a/examples/ipex/text-generation/run_generation.py b/examples/ipex/text-generation/run_generation.py new file mode 100644 index 0000000000..fe57e37cb1 --- /dev/null +++ 
b/examples/ipex/text-generation/run_generation.py @@ -0,0 +1,12 @@ +import torch +from transformers import AutoTokenizer, pipeline + +from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM + + +model_id = "gpt2" +model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) + +print(text_generator("This is an example input")) diff --git a/optimum/intel/generation/modeling.py b/optimum/intel/generation/modeling.py index 07cb8bc98c..b2d091f87b 100644 --- a/optimum/intel/generation/modeling.py +++ b/optimum/intel/generation/modeling.py @@ -102,6 +102,7 @@ def __init__( self.model_save_dir = model_save_dir self.preprocessors = kwargs.get("preprocessors", []) self.use_cache = use_cache + ## TO do: add XPU support self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) self.model_dtype = kwargs.get("model_dtype", None) @@ -282,7 +283,6 @@ def forward( inputs["position_ids"] = position_ids model_type = self.config.model_type.replace("_", "-") - if self.use_cache: if past_key_values is None: nb_pkv = 2 diff --git a/optimum/intel/ipex/__init__.py b/optimum/intel/ipex/__init__.py index bb1d9c270b..710063b980 100644 --- a/optimum/intel/ipex/__init__.py +++ b/optimum/intel/ipex/__init__.py @@ -1 +1,4 @@ from .inference import inference_mode + + +generation_tasks = ("text-generation",) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py new file mode 100644 index 0000000000..6bfb474069 --- /dev/null +++ b/optimum/intel/ipex/modeling_base.py @@ -0,0 +1,228 @@ +import logging +import os +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Union + +import intel_extension_for_pytorch as ipex +import 
torch +from huggingface_hub import hf_hub_download +from transformers import ( + AutoConfig, + AutoModel, + AutoModelForSequenceClassification, + GenerationMixin, + PretrainedConfig, +) +from transformers.models.auto.auto_factory import _get_model_class +from transformers.utils import WEIGHTS_NAME + +from optimum.exporters import TasksManager +from optimum.modeling_base import OptimizedModel + +from ..utils.import_utils import is_torch_version +from ..utils.modeling_utils import patch_decoder_attention_mask +from . import generation_tasks + + +SUPPORT_MODEL_LIST_FOR_CAUSAL_LM = { + # "llama": LlamaForCausalLM +} + +SUPPORT_TASK_LIST = {"text-generation": SUPPORT_MODEL_LIST_FOR_CAUSAL_LM} +from ..generation.modeling import jit_trace + + +logger = logging.getLogger(__name__) + + +class IPEXModel(OptimizedModel): + auto_model_class = AutoModel + export_feature = "feature-extraction" + base_model_prefix = "ipex_model" + + def __init__( + self, + model, + config: PretrainedConfig = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + use_cache: bool = True, + **kwargs, + ): + OptimizedModel.__init__(self, model=model, config=config) + # To do: add XPU support + self._device = torch.device("cpu") + self.model.to(self._device) + + # Registers the IPEXModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating + # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 + AutoConfig.register(self.base_model_prefix, AutoConfig) + if hasattr(self.auto_model_class, "register"): + self.auto_model_class.register(AutoConfig, self.__class__) + + @classmethod + def _from_transformers( + cls, + model_id: str, + config: PretrainedConfig, + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + subfolder: str = "", + local_files_only: bool = 
False, + use_cache: bool = True, + torch_dtype: Optional[Union[str, "torch.dtype"]] = None, + **kwargs, + ): + if is_torch_version("<", "2.1.0"): + raise ImportError("`torch>=2.1.0` is needed to trace your model") + task = cls.export_feature + model_kwargs = { + "revision": revision, + "use_auth_token": use_auth_token, + "cache_dir": cache_dir, + "subfolder": subfolder, + "local_files_only": local_files_only, + "force_download": force_download, + "use_cache": use_cache, + "torch_dtype": torch_dtype, + "device": "cpu", + } + if task not in generation_tasks: + model_kwargs.pop("use_cache") + model_type = None + support_ipex_transformers = False + if task in SUPPORT_TASK_LIST.keys(): + for name in SUPPORT_TASK_LIST[task].keys(): + if name in model_id: + support_ipex_transformers = True + model_type = name + break + + if support_ipex_transformers and task in SUPPORT_TASK_LIST and model_type in SUPPORT_TASK_LIST[task]: + # model = SUPPORT_TASK_LIST[task][model_type].from_pretrained(model_id, **model_kwargs) + pass + else: + model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) + model = patch_decoder_attention_mask(model) + + model = ipex.optimize(model, dtype=torch_dtype, level="O1", auto_kernel_selection=True) + + if kwargs.pop("jit", True): + try: + traced_model = cls.apply_jit_optimize(model, task, use_cache, support_ipex_transformers) + save_dir = TemporaryDirectory() + save_dir_path = Path(save_dir.name) + torch.jit.save(traced_model, save_dir_path / WEIGHTS_NAME) + config.torchscript = True + + return cls._from_pretrained( + model_id=save_dir_path, + config=config, + use_cache=use_cache, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + cache_dir=cache_dir, + local_files_only=local_files_only, + model_dtype=torch_dtype, + **kwargs, + ) + except Exception as e: + logger.warning(f"failed to use PyTorch jit mode due to: {e}.") + + return cls( + model, + config=config, + use_cache=use_cache, + 
model_dtype=torch_dtype, + **kwargs, + ) + + @classmethod + def _from_pretrained( + cls, + model_id: Union[str, Path], + config: PretrainedConfig, + use_auth_token: Optional[Union[bool, str, None]] = None, + revision: Optional[Union[str, None]] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + file_name: Optional[str] = WEIGHTS_NAME, + local_files_only: bool = False, + use_cache: bool = True, + **kwargs, + ): + # Load the model from local directory + if os.path.isdir(model_id): + model_cache_path = os.path.join(model_id, file_name) + model_save_dir = model_id + # Download the model from the hub + else: + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=file_name, + use_auth_token=use_auth_token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + model_save_dir = Path(model_cache_path).parent + + if getattr(config, "torchscript", False): + model = torch.jit.load(model_cache_path) + torch.jit.freeze(model.eval()) + else: + model_class = _get_model_class(config, cls.auto_model_class._model_mapping) + model = model_class.from_pretrained(model_save_dir) + + return cls( + model, + config=config, + model_save_dir=model_save_dir, + use_cache=use_cache, + **kwargs, + ) + + def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs): + if getattr(self.config, "torchscript", False): + torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + else: + torch.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + + def forward(self, *args, **kwargs): + return self.model(*args, **kwargs) + + def eval(self): + self.model.eval() + return self + + @property + def device(self) -> torch.device: + return self._device + + def to(self, device: Union[torch.device, str]): + self._device = device if isinstance(device, torch.device) else torch.device(device) + self.model.to(self._device) + return self + + def 
can_generate(self): + return isinstance(self.model, GenerationMixin) + + def generate(self, *args, **kwargs): + if not self.can_generate(): + raise TypeError( + f"The current model class {self.model.__class__} is not compatible with `.generate()`, as it doesn't have a language model head." + ) + return self.model.generate(*args, **kwargs) + + @classmethod + def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers=False): + return jit_trace(model, task, use_cache) + + +class IPEXModelForSequenceClassification(IPEXModel): + auto_model_class = AutoModelForSequenceClassification + export_feature = "text-classification" diff --git a/optimum/intel/ipex/modeling_decoder.py b/optimum/intel/ipex/modeling_decoder.py new file mode 100644 index 0000000000..9835e771b9 --- /dev/null +++ b/optimum/intel/ipex/modeling_decoder.py @@ -0,0 +1,41 @@ +import logging +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Union + +from transformers import AutoModelForCausalLM, PretrainedConfig + +from ..generation.modeling import BaseModelForCausalLM, jit_trace +from .modeling_base import IPEXModel + + +logger = logging.getLogger(__name__) + + +class IPEXModelForCausalLM(IPEXModel, BaseModelForCausalLM): + auto_model_class = AutoModelForCausalLM + export_feature = "text-generation" + forward = BaseModelForCausalLM.forward + generate = BaseModelForCausalLM.generate + can_generate = BaseModelForCausalLM.can_generate + + def __init__( + self, + model, + config: PretrainedConfig = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + use_cache: bool = True, + **kwargs, + ): + IPEXModel.__init__(self, model, config) + BaseModelForCausalLM.__init__(self, model, config, model_save_dir, use_cache, **kwargs) + + @classmethod + def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers): + if not support_ipex_transformers: + return jit_trace(model, task, use_cache) + else: + # from 
intel_extension_for_pytorch.transformers.optimize import get_dummy_input + # dummy_jit_inputs = get_dummy_input(task, model) # From ipex + # model = torch.jit.trace(model, example_input_kwargs=dummy_jit_inputs) + return model From 91cee3bf6f676d78dfc107606b4e562c56c7f1f6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 12 Jan 2024 03:59:17 -0500 Subject: [PATCH 2/8] fix style --- optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 6bfb474069..cbafe13595 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -20,6 +20,7 @@ from optimum.exporters import TasksManager from optimum.modeling_base import OptimizedModel +from ..generation.modeling import jit_trace from ..utils.import_utils import is_torch_version from ..utils.modeling_utils import patch_decoder_attention_mask from . import generation_tasks @@ -30,7 +31,6 @@ } SUPPORT_TASK_LIST = {"text-generation": SUPPORT_MODEL_LIST_FOR_CAUSAL_LM} -from ..generation.modeling import jit_trace logger = logging.getLogger(__name__) From b2564066c7dd935029fefa108847abc07b275901 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 11:45:03 +0100 Subject: [PATCH 3/8] IPEX refactorization --- README.md | 44 ++++++++-------- docs/source/reference_inc.mdx | 4 +- .../text-classification/run_classification.py | 11 ---- .../ipex/text-generation/run_generation.py | 12 ----- optimum/intel/__init__.py | 25 ++++++++-- optimum/intel/ipex/__init__.py | 10 ++-- optimum/intel/ipex/modeling_base.py | 46 ++++++++++++++++- optimum/intel/ipex/modeling_decoder.py | 41 --------------- optimum/intel/ipex/utils.py | 1 + optimum/intel/neural_compressor/__init__.py | 2 +- .../intel/neural_compressor/modeling_base.py | 39 ++++++--------- .../neural_compressor/modeling_decoder.py | 50 ------------------- optimum/intel/utils/dummy_ipex_objects.py | 44 ++++++++++++++++ 
tests/neural_compressor/test_optimization.py | 2 + tests/neural_compressor/utils_tests.py | 16 +++++- 15 files changed, 173 insertions(+), 174 deletions(-) delete mode 100644 examples/ipex/text-classification/run_classification.py delete mode 100644 examples/ipex/text-generation/run_generation.py delete mode 100644 optimum/intel/ipex/modeling_decoder.py create mode 100644 optimum/intel/ipex/utils.py diff --git a/README.md b/README.md index 047c411248..899c6793e7 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@ To install the latest release of 🤗 Optimum Intel with the corresponding requi | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------| -| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | | [Intel Neural Compressor](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager "optimum[neural-compressor]"` | | [OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager "optimum[openvino,nncf]"` | +| [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/#introduction) | `pip install --upgrade-strategy eager "optimum[ipex]"` | The `--upgrade-strategy eager` option is needed to ensure `optimum-intel` is upgraded to the latest version. @@ -44,26 +44,6 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour -## Intel Extension for PyTorch -To load a model and run generation with IPEX graph mode, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. 
-```diff -import torch -from transformers import AutoTokenizer, pipeline -- from transformers import AutoModelForCausalLM -+ from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM - - -model_id = "gpt2" -- model = AutoModelForCausalLM.from_pretrained(model_id) -+ model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) -tokenizer = AutoTokenizer.from_pretrained(model_id) -text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) - -print(text_generator("This is an example input")) -``` - -For now, we only support text-generation tasks. - ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: @@ -222,6 +202,28 @@ Quantization aware training (QAT) is applied in order to simulate the effects of You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/index). +## Intel Extension for PyTorch + +To load a model and run generation with IPEX graph mode, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. + +```diff +import torch +from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex import IPEXModelForCausalLM + + +model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) +tokenizer = AutoTokenizer.from_pretrained(model_id) +text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer) +results = text_generator("This is an example input") +``` + +For now, we only support text-generation tasks. + + ## Running the examples Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. 
diff --git a/docs/source/reference_inc.mdx b/docs/source/reference_inc.mdx index fcc017c89a..e1a459c094 100644 --- a/docs/source/reference_inc.mdx +++ b/docs/source/reference_inc.mdx @@ -43,8 +43,8 @@ specific language governing permissions and limitations under the License. ## INCModelForCausalLM -[[autodoc]] neural_compressor.modeling_decoder.INCModelForCausalLM +[[autodoc]] neural_compressor.modeling_base.INCModelForCausalLM ## INCModelForSeq2SeqLM -[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM \ No newline at end of file +[[autodoc]] neural_compressor.modeling_base.INCModelForSeq2SeqLM diff --git a/examples/ipex/text-classification/run_classification.py b/examples/ipex/text-classification/run_classification.py deleted file mode 100644 index e0f6255fe0..0000000000 --- a/examples/ipex/text-classification/run_classification.py +++ /dev/null @@ -1,11 +0,0 @@ -import torch -from transformers import AutoTokenizer, pipeline - -from optimum.intel.ipex.modeling_base import IPEXModelForSequenceClassification - - -model_id = "distilbert-base-uncased-finetuned-sst-2-english" -model = IPEXModelForSequenceClassification.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) -tokenizer = AutoTokenizer.from_pretrained(model_id) -text_classifer = pipeline("text-classification", model=model, tokenizer=tokenizer) -print(text_classifer("This movie is disgustingly good !")) diff --git a/examples/ipex/text-generation/run_generation.py b/examples/ipex/text-generation/run_generation.py deleted file mode 100644 index fe57e37cb1..0000000000 --- a/examples/ipex/text-generation/run_generation.py +++ /dev/null @@ -1,12 +0,0 @@ -import torch -from transformers import AutoTokenizer, pipeline - -from optimum.intel.ipex.modeling_decoder import IPEXModelForCausalLM - - -model_id = "gpt2" -model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, torch_dtype=torch.bfloat16) -tokenizer = AutoTokenizer.from_pretrained(model_id) -text_generator = 
pipeline("text-generation", model=model, tokenizer=tokenizer) - -print(text_generator("This is an example input")) diff --git a/optimum/intel/__init__.py b/optimum/intel/__init__.py index 570a451bd8..bb78afc7a3 100644 --- a/optimum/intel/__init__.py +++ b/optimum/intel/__init__.py @@ -35,9 +35,20 @@ if not is_ipex_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - _import_structure["utils.dummy_ipex_objects"] = ["inference_mode"] + from .utils import dummy_ipex_objects + + _import_structure["utils.dummy_ipex_objects"] = [ + name for name in dir(dummy_ipex_objects) if not name.startswith("_") + ] else: - _import_structure["ipex"] = ["inference_mode"] + _import_structure["ipex"] = [ + "inference_mode", + "IPEXModelForCausalLM", + "IPEXModelForSequenceClassification", + "IPEXModelForMaskedLM", + "IPEXModelForTokenClassification", + ] + try: if not (is_openvino_available() and is_nncf_available()): @@ -144,9 +155,15 @@ if not is_ipex_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: - from .utils.dummy_ipex_objects import inference_mode + from .utils.dummy_ipex_objects import * else: - from .ipex import inference_mode + from .ipex import ( + IPEXModelForCausalLM, + IPEXModelForMaskedLM, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, + inference_mode, + ) try: if not (is_openvino_available() and is_nncf_available()): diff --git a/optimum/intel/ipex/__init__.py b/optimum/intel/ipex/__init__.py index 710063b980..5cd0945ac2 100644 --- a/optimum/intel/ipex/__init__.py +++ b/optimum/intel/ipex/__init__.py @@ -1,4 +1,8 @@ -from .inference import inference_mode - +from optimum.intel.ipex.modeling_base import ( + IPEXModelForCausalLM, + IPEXModelForMaskedLM, + IPEXModelForSequenceClassification, + IPEXModelForTokenClassification, +) -generation_tasks = ("text-generation",) +from .inference import inference_mode diff --git a/optimum/intel/ipex/modeling_base.py 
b/optimum/intel/ipex/modeling_base.py index cbafe13595..4e8198ef42 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -10,7 +10,10 @@ from transformers import ( AutoConfig, AutoModel, + AutoModelForCausalLM, + AutoModelForMaskedLM, AutoModelForSequenceClassification, + AutoModelForTokenClassification, GenerationMixin, PretrainedConfig, ) @@ -20,10 +23,10 @@ from optimum.exporters import TasksManager from optimum.modeling_base import OptimizedModel -from ..generation.modeling import jit_trace +from ..generation.modeling import BaseModelForCausalLM, jit_trace from ..utils.import_utils import is_torch_version from ..utils.modeling_utils import patch_decoder_attention_mask -from . import generation_tasks +from .utils import generation_tasks SUPPORT_MODEL_LIST_FOR_CAUSAL_LM = { @@ -226,3 +229,42 @@ def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers=Fa class IPEXModelForSequenceClassification(IPEXModel): auto_model_class = AutoModelForSequenceClassification export_feature = "text-classification" + + +class IPEXModelForMaskedLM(IPEXModel): + auto_model_class = AutoModelForMaskedLM + export_feature = "fill-mask" + + +class IPEXModelForTokenClassification(IPEXModel): + auto_model_class = AutoModelForTokenClassification + export_feature = "token-classification" + + +class IPEXModelForCausalLM(IPEXModel, BaseModelForCausalLM): + auto_model_class = AutoModelForCausalLM + export_feature = "text-generation" + forward = BaseModelForCausalLM.forward + generate = BaseModelForCausalLM.generate + can_generate = BaseModelForCausalLM.can_generate + + def __init__( + self, + model, + config: PretrainedConfig = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + use_cache: bool = True, + **kwargs, + ): + IPEXModel.__init__(self, model, config) + BaseModelForCausalLM.__init__(self, model, config, model_save_dir, use_cache, **kwargs) + + @classmethod + def apply_jit_optimize(cls, model, task, 
use_cache, support_ipex_transformers): + if not support_ipex_transformers: + return jit_trace(model, task, use_cache) + else: + # from intel_extension_for_pytorch.transformers.optimize import get_dummy_input + # dummy_jit_inputs = get_dummy_input(task, model) # From ipex + # model = torch.jit.trace(model, example_input_kwargs=dummy_jit_inputs) + return model diff --git a/optimum/intel/ipex/modeling_decoder.py b/optimum/intel/ipex/modeling_decoder.py deleted file mode 100644 index 9835e771b9..0000000000 --- a/optimum/intel/ipex/modeling_decoder.py +++ /dev/null @@ -1,41 +0,0 @@ -import logging -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Optional, Union - -from transformers import AutoModelForCausalLM, PretrainedConfig - -from ..generation.modeling import BaseModelForCausalLM, jit_trace -from .modeling_base import IPEXModel - - -logger = logging.getLogger(__name__) - - -class IPEXModelForCausalLM(IPEXModel, BaseModelForCausalLM): - auto_model_class = AutoModelForCausalLM - export_feature = "text-generation" - forward = BaseModelForCausalLM.forward - generate = BaseModelForCausalLM.generate - can_generate = BaseModelForCausalLM.can_generate - - def __init__( - self, - model, - config: PretrainedConfig = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - use_cache: bool = True, - **kwargs, - ): - IPEXModel.__init__(self, model, config) - BaseModelForCausalLM.__init__(self, model, config, model_save_dir, use_cache, **kwargs) - - @classmethod - def apply_jit_optimize(cls, model, task, use_cache, support_ipex_transformers): - if not support_ipex_transformers: - return jit_trace(model, task, use_cache) - else: - # from intel_extension_for_pytorch.transformers.optimize import get_dummy_input - # dummy_jit_inputs = get_dummy_input(task, model) # From ipex - # model = torch.jit.trace(model, example_input_kwargs=dummy_jit_inputs) - return model diff --git a/optimum/intel/ipex/utils.py 
b/optimum/intel/ipex/utils.py new file mode 100644 index 0000000000..be0af76407 --- /dev/null +++ b/optimum/intel/ipex/utils.py @@ -0,0 +1 @@ +generation_tasks = ("text-generation",) diff --git a/optimum/intel/neural_compressor/__init__.py b/optimum/intel/neural_compressor/__init__.py index cb5621a333..a7170120b7 100644 --- a/optimum/intel/neural_compressor/__init__.py +++ b/optimum/intel/neural_compressor/__init__.py @@ -16,6 +16,7 @@ from .configuration import INCConfig from .modeling_base import ( INCModel, + INCModelForCausalLM, INCModelForMaskedLM, INCModelForMultipleChoice, INCModelForQuestionAnswering, @@ -24,7 +25,6 @@ INCModelForTokenClassification, INCModelForVision2Seq, ) -from .modeling_decoder import INCModelForCausalLM from .quantization import INCQuantizationMode, INCQuantizer from .trainer import INCTrainer from .trainer_seq2seq import INCSeq2SeqTrainer diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 5cd224146a..6fca80e6f0 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -24,6 +24,7 @@ from transformers import ( AutoConfig, AutoModel, + AutoModelForCausalLM, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -37,7 +38,6 @@ ) from transformers.modeling_utils import no_init_weights from transformers.models.auto.auto_factory import _get_model_class -from transformers.utils import is_ipex_available from transformers.utils.generic import ContextManagers from ...modeling_base import OptimizedModel @@ -84,18 +84,12 @@ def __init__( ) if getattr(self.config, "backend", None) == "ipex": - if not is_ipex_available(): - raise ImportError( - "Intel PyTorch Extensions was not found, please make sure you've installed the package or run `pip install intel-extension-for-pytorch`" - ) - # Need import intel_extension_for_pytorch for ipex model - import intel_extension_for_pytorch as ipex - - # 
Just to avoid to change by ruff. - logger.info("intel_extension_for_pytorch version is " + ipex.__version__) + raise NotImplementedError( + "`INCModel` does not support the loading of models resulting from IPEX, please use `IPEXModel` to load your model instead" + ) # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating - # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 + # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 AutoConfig.register(self.base_model_prefix, AutoConfig) if hasattr(self.auto_model_class, "register"): self.auto_model_class.register(AutoConfig, self.__class__) @@ -149,13 +143,6 @@ def _from_pretrained( f"Please check if torch quantization the model was obtained with is compatible with {_torch_version}." ) - if getattr(config, "backend", None) == "ipex" or getattr(config, "torchscript", False): - # NOTE: Will improve to use load function when Intel Neural Compressor next 2.1 release. 
- # load(model_cache_path) - model = torch.jit.load(model_cache_path) - model = torch.jit.freeze(model.eval()) - return cls(model, config=config, model_save_dir=model_save_dir, inc_config=inc_config, **kwargs) - model_class = _get_model_class(config, cls.auto_model_class._model_mapping) # Load the state dictionary of the model to verify whether the model to get the quantization config state_dict = torch.load(model_cache_path, map_location="cpu") @@ -182,13 +169,10 @@ def _from_pretrained( def _save_pretrained(self, save_directory: Union[str, Path]): output_path = os.path.join(save_directory, WEIGHTS_NAME) - if isinstance(self.model, torch.nn.Module): - state_dict = self.model.state_dict() - if self._q_config: - state_dict["best_configure"] = self._q_config - torch.save(state_dict, output_path) - else: - torch.jit.save(self.model, output_path) + state_dict = self.model.state_dict() + if self._q_config: + state_dict["best_configure"] = self._q_config + torch.save(state_dict, output_path) if self.inc_config: self.inc_config.save_pretrained(save_directory) @@ -258,3 +242,8 @@ class INCModelForVision2Seq(INCModel): class INCModelForXLNetLM(INCModel): auto_model_class = XLNetLMHeadModel export_feature = "fill-mask" + + +class INCModelForCausalLM(INCModel): + auto_model_class = AutoModelForCausalLM + export_feature = "text-generation" diff --git a/optimum/intel/neural_compressor/modeling_decoder.py b/optimum/intel/neural_compressor/modeling_decoder.py index e284ce4c3e..4cb767a102 100644 --- a/optimum/intel/neural_compressor/modeling_decoder.py +++ b/optimum/intel/neural_compressor/modeling_decoder.py @@ -11,53 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -import logging -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Dict, Optional, Union - -from transformers import AutoModelForCausalLM, PretrainedConfig -from transformers.file_utils import add_start_docstrings - -from optimum.intel.generation import BaseModelForCausalLM - -from .modeling_base import MODEL_START_DOCSTRING, INCModel - - -logger = logging.getLogger(__name__) - - -@add_start_docstrings( - """ - Neural-compressor Model with a causal language modeling head on top (linear layer with weights tied to the input - embeddings). - """, - MODEL_START_DOCSTRING, -) -class INCModelForCausalLM(INCModel, BaseModelForCausalLM): - auto_model_class = AutoModelForCausalLM - export_feature = "text-generation" - forward = BaseModelForCausalLM.forward - generate = BaseModelForCausalLM.generate - can_generate = BaseModelForCausalLM.can_generate - - def __init__( - self, - model, - config: PretrainedConfig = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - q_config: Dict = None, - inc_config: Dict = None, - use_cache: bool = True, - **kwargs, - ): - super(INCModelForCausalLM, self).__init__( - model=model, - config=config, - model_save_dir=model_save_dir, - q_config=q_config, - inc_config=inc_config, - use_cache=use_cache, - **kwargs, - ) diff --git a/optimum/intel/utils/dummy_ipex_objects.py b/optimum/intel/utils/dummy_ipex_objects.py index d4962e75a2..5e36db6acd 100644 --- a/optimum/intel/utils/dummy_ipex_objects.py +++ b/optimum/intel/utils/dummy_ipex_objects.py @@ -20,3 +20,47 @@ class inference_mode(metaclass=DummyObject): def __init__(self, *args, **kwargs): requires_backends(self, ["ipex"]) + + +class IPEXModelForSequenceClassification(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) + + +class 
IPEXModelForTokenClassification(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) + + +class IPEXModelForMaskedLM(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) + + +class IPEXModelForCausalLM(metaclass=DummyObject): + _backends = ["ipex"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["ipex"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["ipex"]) diff --git a/tests/neural_compressor/test_optimization.py b/tests/neural_compressor/test_optimization.py index f28c720138..3a7717d17a 100644 --- a/tests/neural_compressor/test_optimization.py +++ b/tests/neural_compressor/test_optimization.py @@ -190,6 +190,8 @@ def test_ipex_static_quantization_with_smoothquant(self, task, model_name, expec is_static=True, load_onnx_model=False, num_samples=num_samples, + load_inc_model=False, + load_ipex_model=True, ) def test_weight_only_quantization(self): diff --git a/tests/neural_compressor/utils_tests.py b/tests/neural_compressor/utils_tests.py index 0a9cc0b664..a429ce6dd1 100644 --- a/tests/neural_compressor/utils_tests.py +++ b/tests/neural_compressor/utils_tests.py @@ -39,6 +39,14 @@ INCSeq2SeqTrainer, INCStableDiffusionPipeline, ) + +from optimum.intel.ipex import ( + IPEXModelForCausalLM, + IPEXModelForSequenceClassification, + IPEXModelForMaskedLM, + IPEXModelForTokenClassification, +) + from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS from optimum.intel.utils.constant import ONNX_WEIGHTS_NAME from optimum.onnxruntime import ORTModelForCausalLM, ORTModelForSequenceClassification @@ -94,6 +102,7 @@ def check_model_outputs( load_inc_model=True, 
num_samples=None, file_name=None, + load_ipex_model=False, ): tokens = tokenizer("This is a sample input", return_tensors="pt") file_name = ONNX_WEIGHTS_NAME if task != "text-generation" else "decoder_model.onnx" @@ -111,8 +120,11 @@ def check_model_outputs( with torch.no_grad(): model_outputs = q_model(**tokens) outputs = model_outputs["logits"] if isinstance(model_outputs, dict) else model_outputs[0] - if load_inc_model: - inc_model = eval(_HEAD_TO_AUTOMODELS[task]).from_pretrained(save_directory) + auto_class = _HEAD_TO_AUTOMODELS[task] + if load_ipex_model: + auto_class = auto_class.replace("INC", "IPEX") + if load_inc_model or load_ipex_model: + inc_model = eval(auto_class).from_pretrained(save_directory) inc_model_outputs = inc_model(**tokens) self.assertTrue(torch.allclose(inc_model_outputs["logits"], outputs, atol=1e-2)) # self.assertEqual(inc_config.save_onnx_model, load_onnx_model) From 36593e2c6b9ae47092e1f5a0a6be59cb959eb4e9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 12:13:02 +0100 Subject: [PATCH 4/8] typo --- optimum/intel/neural_compressor/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/neural_compressor/modeling_base.py b/optimum/intel/neural_compressor/modeling_base.py index 6fca80e6f0..b74c08a573 100644 --- a/optimum/intel/neural_compressor/modeling_base.py +++ b/optimum/intel/neural_compressor/modeling_base.py @@ -89,7 +89,7 @@ def __init__( ) # Registers the INCModelForXXX classes into the transformers AutoModel classes to avoid warnings when creating - # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863NotImplementedError + # a pipeline https://github.com/huggingface/transformers/blob/cad61b68396a1a387287a8e2e2fef78a25b79383/src/transformers/pipelines/base.py#L863 AutoConfig.register(self.base_model_prefix, AutoConfig) if hasattr(self.auto_model_class, "register"): 
self.auto_model_class.register(AutoConfig, self.__class__) From 8a98ce9088e060b19612918d89b07139b14d1013 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 12:21:39 +0100 Subject: [PATCH 5/8] remove use cache arg when loading model --- optimum/intel/ipex/modeling_base.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 4e8198ef42..95830366fb 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -26,7 +26,9 @@ from ..generation.modeling import BaseModelForCausalLM, jit_trace from ..utils.import_utils import is_torch_version from ..utils.modeling_utils import patch_decoder_attention_mask -from .utils import generation_tasks + + +# from .utils import generation_tasks SUPPORT_MODEL_LIST_FOR_CAUSAL_LM = { @@ -88,12 +90,9 @@ def _from_transformers( "subfolder": subfolder, "local_files_only": local_files_only, "force_download": force_download, - "use_cache": use_cache, "torch_dtype": torch_dtype, "device": "cpu", } - if task not in generation_tasks: - model_kwargs.pop("use_cache") model_type = None support_ipex_transformers = False if task in SUPPORT_TASK_LIST.keys(): From c8043075a825844b21271b31a3a2d9c5b1dba3cd Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 12:23:51 +0100 Subject: [PATCH 6/8] fix style --- optimum/intel/ipex/modeling_base.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 95830366fb..f00def43b7 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -134,13 +134,7 @@ def _from_transformers( except Exception as e: logger.warning(f"failed to use PyTorch jit mode due to: {e}.") - return cls( - model, - config=config, - use_cache=use_cache, - model_dtype=torch_dtype, - **kwargs, - ) + return cls(model, config=config, use_cache=use_cache, 
model_dtype=torch_dtype, **kwargs) @classmethod def _from_pretrained( From 01e7b3fa91753ce201705d1b11c73437ef694c77 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 14:12:38 +0100 Subject: [PATCH 7/8] move tests --- tests/ipex/test_modeling.py | 66 ++++++++++++++++++++++++ tests/neural_compressor/test_modeling.py | 43 --------------- 2 files changed, 66 insertions(+), 43 deletions(-) create mode 100644 tests/ipex/test_modeling.py diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py new file mode 100644 index 0000000000..7af8e0dc60 --- /dev/null +++ b/tests/ipex/test_modeling.py @@ -0,0 +1,66 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import time +import unittest + +import torch +from transformers import AutoTokenizer + +from optimum.intel import IPEXModelForCausalLM + + +class Timer(object): + def __enter__(self): + self.elapsed = time.perf_counter() + return self + + def __exit__(self, type, value, traceback): + self.elapsed = (time.perf_counter() - self.elapsed) * 1e3 + + +class IPEXModelingTest(unittest.TestCase): + GENERATION_LENGTH = 100 + SPEEDUP_CACHE = 1.1 + + def test_compare_with_and_without_past_key_values(self): + model_id = "echarlaix/tiny-random-gpt2-torchscript" + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer("This is a sample input", return_tensors="pt") + + model_with_pkv = IPEXModelForCausalLM.from_pretrained(model_id, use_cache=True, subfolder="model_with_pkv") + # Warmup + model_with_pkv.generate(**tokens) + with Timer() as with_pkv_timer: + outputs_model_with_pkv = model_with_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) + model_without_pkv = IPEXModelForCausalLM.from_pretrained( + model_id, use_cache=False, subfolder="model_without_pkv" + ) + # Warmup + model_without_pkv.generate(**tokens) + with Timer() as without_pkv_timer: + outputs_model_without_pkv = model_without_pkv.generate( + **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 + ) + self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) + self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) + self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) + self.assertTrue( + without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, + f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," + f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", + ) diff --git a/tests/neural_compressor/test_modeling.py 
b/tests/neural_compressor/test_modeling.py index 8098f011c5..e0a41e76af 100644 --- a/tests/neural_compressor/test_modeling.py +++ b/tests/neural_compressor/test_modeling.py @@ -15,11 +15,9 @@ import os import tempfile -import time import unittest import torch -from packaging.version import Version, parse from parameterized import parameterized from transformers import AutoTokenizer, pipeline, set_seed @@ -40,7 +38,6 @@ INCTrainer, ) from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME -from optimum.version import __version__ as _optimum_version os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -68,15 +65,6 @@ DIFFUSERS_MODEL_NAMES_TO_TASK = (("echarlaix/stable-diffusion-v1-5-inc-int8-dynamic", "stable-diffusion"),) -class Timer(object): - def __enter__(self): - self.elapsed = time.perf_counter() - return self - - def __exit__(self, type, value, traceback): - self.elapsed = (time.perf_counter() - self.elapsed) * 1e3 - - class INCModelingTest(unittest.TestCase): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 @@ -134,34 +122,3 @@ def test_pipeline(self, model_id, task): inputs *= 2 pipe(*inputs) - - @unittest.skipIf(parse(_optimum_version) < Version("1.14.0"), "not supported, needs optimum>=v1.14.0") - def test_compare_with_and_without_past_key_values(self): - model_id = "echarlaix/tiny-random-gpt2-torchscript" - tokenizer = AutoTokenizer.from_pretrained(model_id) - tokens = tokenizer("This is a sample input", return_tensors="pt") - - model_with_pkv = INCModelForCausalLM.from_pretrained(model_id, use_cache=True, subfolder="model_with_pkv") - # Warmup - model_with_pkv.generate(**tokens) - with Timer() as with_pkv_timer: - outputs_model_with_pkv = model_with_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - model_without_pkv = INCModelForCausalLM.from_pretrained( - model_id, use_cache=False, subfolder="model_without_pkv" - ) - # Warmup - model_without_pkv.generate(**tokens) - with 
Timer() as without_pkv_timer: - outputs_model_without_pkv = model_without_pkv.generate( - **tokens, min_length=self.GENERATION_LENGTH, max_length=self.GENERATION_LENGTH, num_beams=1 - ) - self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual(outputs_model_with_pkv.shape[1], self.GENERATION_LENGTH) - self.assertEqual(outputs_model_without_pkv.shape[1], self.GENERATION_LENGTH) - self.assertTrue( - without_pkv_timer.elapsed / with_pkv_timer.elapsed > self.SPEEDUP_CACHE, - f"With pkv latency: {with_pkv_timer.elapsed:.3f} ms, without pkv latency: {without_pkv_timer.elapsed:.3f} ms," - f" speedup: {without_pkv_timer.elapsed / with_pkv_timer.elapsed:.3f}", - ) From 11493f27d2e3b166ba5329283f41fe81633cb0f8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix Date: Tue, 16 Jan 2024 14:24:15 +0100 Subject: [PATCH 8/8] Add fix --- optimum/intel/ipex/modeling_base.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index f00def43b7..d8c0a89ebf 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -82,6 +82,7 @@ def _from_transformers( ): if is_torch_version("<", "2.1.0"): raise ImportError("`torch>=2.0.0` is needed to trace your model") + task = cls.export_feature model_kwargs = { "revision": revision, @@ -147,6 +148,7 @@ def _from_pretrained( cache_dir: Optional[str] = None, file_name: Optional[str] = WEIGHTS_NAME, local_files_only: bool = False, + subfolder: str = "", use_cache: bool = True, **kwargs, ): @@ -164,6 +166,7 @@ def _from_pretrained( cache_dir=cache_dir, force_download=force_download, local_files_only=local_files_only, + subfolder=subfolder, ) model_save_dir = Path(model_cache_path).parent @@ -174,19 +177,16 @@ def _from_pretrained( model_class = _get_model_class(config, cls.auto_model_class._model_mapping) model = model_class.from_pretrained(model_save_dir) - return cls( 
- model, - config=config, - model_save_dir=model_save_dir, - use_cache=use_cache, - **kwargs, - ) - - def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs): - if getattr(self.config, "torchscript", False): - torch.jit.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + return cls(model, config=config, model_save_dir=model_save_dir, use_cache=use_cache, **kwargs) + + def _save_pretrained(self, save_directory: Union[str, Path]): + output_path = os.path.join(save_directory, WEIGHTS_NAME) + + if isinstance(self.model, torch.nn.Module): + state_dict = self.model.state_dict() + torch.save(state_dict, output_path) else: - torch.save(self.model, os.path.join(save_directory, WEIGHTS_NAME)) + torch.jit.save(self.model, output_path) def forward(self, *args, **kwargs): return self.model(*args, **kwargs)