diff --git a/notebooks/ipex/text_generation.ipynb b/notebooks/ipex/text_generation.ipynb index d1a62d9201..dd6b8c0abb 100644 --- a/notebooks/ipex/text_generation.ipynb +++ b/notebooks/ipex/text_generation.ipynb @@ -22,6 +22,7 @@ "source": [ "import torch\n", "from transformers import AutoTokenizer\n", + "\n", "from optimum.intel.ipex import IPEXModelForCausalLM" ] }, diff --git a/notebooks/openvino/optimum_openvino_inference.ipynb b/notebooks/openvino/optimum_openvino_inference.ipynb index 76c77aec55..5106fe1fba 100644 --- a/notebooks/openvino/optimum_openvino_inference.ipynb +++ b/notebooks/openvino/optimum_openvino_inference.ipynb @@ -78,6 +78,7 @@ "source": [ "from optimum.intel import OVModelForQuestionAnswering\n", "\n", + "\n", "# Load PyTorch model from the Hub and export to OpenVINO in the background\n", "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad\", export=True)\n", "\n", @@ -122,6 +123,7 @@ "source": [ "from transformers import AutoTokenizer\n", "\n", + "\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n", "tokenizer.save_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")" ] @@ -182,9 +184,11 @@ } ], "source": [ - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n", "ov_pipe = pipeline(\"question-answering\", model=model, tokenizer=tokenizer)\n", @@ -240,9 +244,11 @@ ], "source": [ "import torch\n", - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "model = 
OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n", "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n", "\n", @@ -324,9 +330,11 @@ } ], "source": [ - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "model = OVModelForQuestionAnswering.from_pretrained(\n", " \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\", compile=False\n", ")\n", @@ -411,6 +419,7 @@ "source": [ "from openvino.runtime import Core\n", "\n", + "\n", "for device in Core().available_devices:\n", " print(device, Core().get_property(device, \"FULL_DEVICE_NAME\"))" ] @@ -528,10 +537,12 @@ } ], "source": [ + "from datasets import load_dataset\n", "from IPython.display import Audio\n", - "from optimum.intel import OVModelForAudioClassification\n", "from transformers import AutoFeatureExtractor, pipeline\n", - "from datasets import load_dataset\n", + "\n", + "from optimum.intel import OVModelForAudioClassification\n", + "\n", "\n", "model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n", "model = OVModelForAudioClassification.from_pretrained(model_id)\n", @@ -638,9 +649,11 @@ } ], "source": [ - "from optimum.intel import OVModelForCausalLM\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForCausalLM\n", + "\n", + "\n", "model_id = \"helenai/gpt2-ov\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = OVModelForCausalLM.from_pretrained(model_id)\n", @@ -704,9 +717,11 @@ ], "source": [ "from IPython.display import Image\n", - "from optimum.intel import OVModelForImageClassification\n", "from transformers import AutoImageProcessor, pipeline\n", "\n", + "from optimum.intel import OVModelForImageClassification\n", + "\n", + "\n", "model_id = 
\"helenai/microsoft-swin-tiny-patch4-window7-224-ov\"\n", "model = OVModelForImageClassification.from_pretrained(model_id, compile=False)\n", "image_processor = AutoImageProcessor.from_pretrained(model_id)\n", @@ -766,9 +781,11 @@ } ], "source": [ - "from optimum.intel import OVModelForMaskedLM\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForMaskedLM\n", + "\n", + "\n", "model_id = \"helenai/bert-base-uncased-ov\"\n", "model = OVModelForMaskedLM.from_pretrained(model_id)\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", @@ -835,9 +852,11 @@ } ], "source": [ - "from optimum.intel import OVModelForQuestionAnswering\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForQuestionAnswering\n", + "\n", + "\n", "# Load the model and tokenizer saved in Part 1 of this notebook. Or use the line below to load them from the hub\n", "# model_id = \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\"\n", "model_id = \"distilbert-base-uncased-distilled-squad-ov-fp32\"\n", @@ -890,9 +909,11 @@ } ], "source": [ - "from optimum.intel import OVModelForSeq2SeqLM\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForSeq2SeqLM\n", + "\n", + "\n", "model_id = \"helenai/t5-small-ov\"\n", "model = OVModelForSeq2SeqLM.from_pretrained(model_id, compile=False, trust_remote_code=True)\n", "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n", @@ -998,9 +1019,11 @@ } ], "source": [ - "from optimum.intel import OVModelForSequenceClassification\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForSequenceClassification\n", + "\n", + "\n", "model_id = \"helenai/papluca-xlm-roberta-base-language-detection-ov\"\n", "model = OVModelForSequenceClassification.from_pretrained(model_id)\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", @@ -1047,9 
+1070,11 @@ } ], "source": [ - "from optimum.intel import OVModelForTokenClassification\n", "from transformers import AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVModelForTokenClassification\n", + "\n", + "\n", "model_id = \"helenai/dslim-bert-base-NER-ov-fp32\"\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = OVModelForTokenClassification.from_pretrained(model_id)\n", diff --git a/notebooks/openvino/quantized_generation_demo.ipynb b/notebooks/openvino/quantized_generation_demo.ipynb index 5673243cb2..c160e735b0 100644 --- a/notebooks/openvino/quantized_generation_demo.ipynb +++ b/notebooks/openvino/quantized_generation_demo.ipynb @@ -45,6 +45,7 @@ "import os\n", "\n", "from transformers import AutoTokenizer\n", + "\n", "from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig" ] }, @@ -211,6 +212,7 @@ "source": [ "from transformers import TextStreamer\n", "\n", + "\n", "# Tokenize the sample\n", "inputs = tokenizer([sample], return_tensors='pt')\n", "\n", @@ -294,7 +296,7 @@ "\n", "\n", "# Tokenize the sample\n", - "inputs = tokenizer([sample], return_tensors='pt') \n", + "inputs = tokenizer([sample], return_tensors='pt')\n", "\n", "out = stateless_model.generate(\n", " **inputs,\n", @@ -302,7 +304,7 @@ " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n", " pad_token_id=tokenizer.eos_token_id,\n", " prompt_lookup_num_tokens=3,\n", - ") " + ")" ] }, { @@ -442,6 +444,7 @@ "outputs": [], "source": [ "from functools import wraps\n", + "\n", "import numpy as np\n", "\n", "\n", @@ -458,15 +461,15 @@ " if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n", " raise RuntimeError(\"Always use a new instance, don't reuse!\")\n", " self.model_forward = self.model.forward\n", - " \n", + "\n", " @wraps(self.model_forward)\n", " def forward_wrapper(**kwargs):\n", " self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n", " self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] 
- 1)\n", " return self.model_forward(**kwargs)\n", - " \n", + "\n", " self.model.forward = forward_wrapper\n", - " \n", + "\n", " # wrap generate method\n", " self.model_generate = self.model.generate\n", "\n", @@ -494,7 +497,7 @@ " self.seq_lens = [sl[1:] for sl in self.seq_lens]\n", " # Add window size for output to ease calculation later\n", " for ws, sl in zip(self.win_sizes, self.seq_lens):\n", - " ws.append(0) \n", + " ws.append(0)\n", "\n", " def acceptance_rate(self, return_mean=True, normalize=False):\n", " # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n", @@ -533,8 +536,9 @@ "metadata": {}, "outputs": [], "source": [ - "from tqdm import tqdm\n", "from datasets import load_dataset\n", + "from tqdm import tqdm\n", + "\n", "\n", "dataset_name = \"openai_humaneval\"\n", "dataset_subset_name = None\n", @@ -590,10 +594,10 @@ "from threading import Thread\n", "\n", "from transformers import (\n", - " TextIteratorStreamer,\n", + " GenerationConfig,\n", " StoppingCriteria,\n", " StoppingCriteriaList,\n", - " GenerationConfig,\n", + " TextIteratorStreamer,\n", ")\n", "\n", "\n", @@ -690,7 +694,7 @@ " prompt_char = \"▌\"\n", " history[-1][1] = prompt_char\n", " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n", - " \n", + "\n", " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n", "\n", " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n", @@ -770,6 +774,7 @@ "source": [ "import gradio as gr\n", "\n", + "\n", "try:\n", " demo.close()\n", "except:\n", @@ -808,7 +813,7 @@ " history: conversation history\n", " Returns:\n", " updated history\n", - " \"\"\" \n", + " \"\"\"\n", " history[-1][1] = None\n", " return history\n", "\n", diff --git a/notebooks/openvino/question_answering_quantization.ipynb b/notebooks/openvino/question_answering_quantization.ipynb index 2481c9b904..247a6f868b 100644 --- 
a/notebooks/openvino/question_answering_quantization.ipynb +++ b/notebooks/openvino/question_answering_quantization.ipynb @@ -51,9 +51,11 @@ "import transformers\n", "from evaluate import evaluator\n", "from openvino.runtime import Core\n", - "from optimum.intel import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n", "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n", "\n", + "from optimum.intel import OVConfig, OVModelForQuestionAnswering, OVQuantizationConfig, OVQuantizer\n", + "\n", + "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" ] diff --git a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb index 8ef2e8ad6c..798aede77a 100644 --- a/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb +++ b/notebooks/openvino/stable_diffusion_hybrid_quantization.ipynb @@ -46,15 +46,18 @@ "outputs": [], "source": [ "import time\n", + "from pathlib import Path\n", + "\n", "import datasets\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import transformers\n", - "from pathlib import Path\n", "from openvino.runtime import Core\n", + "\n", "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n", "from optimum.intel.openvino.configuration import OVQuantizationMethod\n", "\n", + "\n", "transformers.logging.set_verbosity_error()\n", "datasets.logging.set_verbosity_error()" ] diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 484fd38077..2a9af1cd52 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -29,11 +29,11 @@ from .modeling_utils import ( _IPEX_MINIMUM_VERSION_FOR_PATCHING, _gpt2_block_forward, - _ipex_rms_layer_norm_forward, _IPEXFalconDecoderLayer, _IPEXGPT2Attention, _IPEXIntermediate, _IPEXLlamaDecoderLayer, + 
_llama_layer_norm_forward, _llama_model_forward, ) @@ -79,7 +79,7 @@ def _patch_llama_model(model): 2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add) """ convert_functions(model, LlamaModel, "forward", _llama_model_forward) - convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward) + convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward) convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config) return model diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 3d28350b86..2e73fb9076 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -17,6 +17,7 @@ from typing import List, Optional, Tuple, Union import torch +from intel_extension_for_pytorch.llm.functional import rms_norm from torch import nn from torch.nn import functional as F from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask @@ -32,7 +33,6 @@ _IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0" - if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): logger.warning( f"Please upgrade the IPEX version to at least {_IPEX_MINIMUM_VERSION_FOR_PATCHING} if you want to patch the model." 
@@ -48,9 +48,48 @@ ) -# Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 -def _ipex_rms_layer_norm_forward(self, hidden_states): - return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon) +def matmul_add_add(attn_output, weight, bias=None, residual=None): + seq_len, bs, _ = attn_output.size() + if residual is None: + attn_output = torch.matmul(attn_output, weight) + if bias is not None: + attn_output += bias + else: + if bias is not None: + attn_output = torch.ops.torch_ipex.mm_bias_resadd(attn_output, weight, bias, 1.0, residual, 1.0) + else: + attn_output = torch.addmm( + residual.flatten(0, -2), + attn_output.flatten(0, -2), + weight, + beta=1.0, + ) + attn_output = attn_output.view(seq_len, bs, -1) + return attn_output + + +def padding_attn_mask(attn_mask, alignment): + if attn_mask is None: + return None + assert isinstance( + attn_mask, torch.Tensor + ), f"attn mask is supposed to be a tensor, instead we got {type(attn_mask)}" + if attn_mask.device == torch.device("cpu"): + return attn_mask + last_dim_size = attn_mask.size(-1) + aligned_size = (last_dim_size + alignment - 1) // alignment * alignment + mask_size = [*attn_mask.size()[:-1], aligned_size] + new_attn_mask = torch.empty(mask_size, dtype=attn_mask.dtype, device=attn_mask.device).fill_(-65504.0) + new_attn_mask[..., :last_dim_size] = attn_mask + return new_attn_mask + + +def _llama_layer_norm_forward(self, hidden_states): + if hidden_states.device.type == "xpu": + return rms_norm(hidden_states, self.weight, self.variance_epsilon) + else: + # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 + return torch.ops.torch_ipex.rmsnorm(hidden_states, self.weight, self.variance_epsilon) # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L1130 @@ -108,6 +147,8 @@ def 
_llama_model_forward( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length ) + attention_mask = padding_attn_mask(attention_mask, 8) + # embed positions hidden_states = inputs_embeds @@ -115,12 +156,20 @@ def _llama_model_forward( all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None next_decoder_cache = () if use_cache else None - + if hidden_states.device.type == "xpu": + seqlen = hidden_states.size(1) + head_dim = self.layers[0].self_attn.head_dim + sin, cos = self.layers[0].self_attn.ipex_rope.get_sin_cos(seqlen, head_dim // 2) + sin = sin.squeeze()[position_ids].unsqueeze(2) + cos = cos.squeeze()[position_ids].unsqueeze(2) + decoder_layer_kwargs = {"sin": sin, "cos": cos} + else: + decoder_layer_kwargs = {} for idx, decoder_layer in enumerate(self.layers): if output_hidden_states: all_hidden_states += (hidden_states,) - past_key_value = past_key_values[idx] if past_key_values is not None else None + past_key_value = past_key_values[idx] if past_key_values is not None and len(past_key_values) > idx else None layer_outputs = decoder_layer( hidden_states, @@ -129,6 +178,7 @@ def _llama_model_forward( past_key_value=past_key_value, output_attentions=output_attentions, use_cache=use_cache, + **decoder_layer_kwargs, ) hidden_states = layer_outputs[0] @@ -174,14 +224,82 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - self.ipex_scale_dot_product = IndirectAccessKVCacheAttention(text_max_length=config.max_position_embeddings) - if hasattr(config, "rope_theta"): - self.ipex_rope = RotaryEmbedding( - config.max_position_embeddings, - config.hidden_size // config.num_attention_heads, - config.rope_theta, - config.architectures[0], + self.module_device = next(module.parameters()).device.type + if self.module_device == "xpu": + from intel_extension_for_pytorch.transformers.models.xpu.fusions.mha_fusion import 
_IPEXRopeXPU + + self.ipex_rope = _IPEXRopeXPU( + module.config.max_position_embeddings, + module.config.hidden_size // module.config.num_attention_heads, + module.config.rope_theta, + module.config.architectures[0], + ) + self.port_parameters(module) + torch.xpu.empty_cache() + else: + self.ipex_scale_dot_product = IndirectAccessKVCacheAttention( + text_max_length=config.max_position_embeddings ) + if hasattr(config, "rope_theta"): + self.ipex_rope = RotaryEmbedding( + config.max_position_embeddings, + config.hidden_size // config.num_attention_heads, + config.rope_theta, + config.architectures[0], + ) + + def port_parameters(self, module): + self.qkv_proj_bias = None + self.qkv_proj_weight = None + if self.num_heads == self.num_key_value_heads: + q_proj = module.q_proj.weight.transpose(0, 1) + k_proj = module.k_proj.weight.transpose(0, 1) + v_proj = module.v_proj.weight.transpose(0, 1) + self.qkv_proj_weight = torch.stack([q_proj, k_proj, v_proj]).contiguous().view([3, -1, q_proj.shape[-1]]) + module.q_proj.weight.data = self.qkv_proj_weight[0, :, :].transpose(0, 1) + module.k_proj.weight.data = self.qkv_proj_weight[1, :, :].transpose(0, 1) + module.v_proj.weight.data = self.qkv_proj_weight[2, :, :].transpose(0, 1) + if module.q_proj.bias is not None: + self.qkv_proj_bias = ( + torch.stack([module.q_proj.bias, module.k_proj.bias, module.v_proj.bias]) + .contiguous() + .view([3, -1]) + ) + module.q_proj.bias.data = self.qkv_proj_bias[0] + module.k_proj.bias.data = self.qkv_proj_bias[1] + module.v_proj.bias.data = self.qkv_proj_bias[2] + else: + q_proj = module.q_proj.weight.view( + self.num_key_value_heads, self.num_key_value_groups, self.head_dim, self.hidden_size + ) + k_proj = module.k_proj.weight.view(self.num_key_value_heads, 1, self.head_dim, self.hidden_size) + v_proj = module.v_proj.weight.view(self.num_key_value_heads, 1, self.head_dim, self.hidden_size) + self.qkv_proj_weight = torch.cat([q_proj, k_proj, v_proj], dim=1).view( + [self.num_key_value_heads, 
self.num_key_value_groups + 2, self.head_dim, self.hidden_size] + ) + module.q_proj.weight.data = self.qkv_proj_weight[:, : self.num_key_value_groups, :, :].reshape( + [self.num_key_value_heads * self.num_key_value_groups * self.head_dim, self.hidden_size] + ) + module.k_proj.weight.data = self.qkv_proj_weight[:, self.num_key_value_groups, :, :].reshape( + [self.num_key_value_heads * self.head_dim, self.hidden_size] + ) + module.v_proj.weight.data = self.qkv_proj_weight[:, self.num_key_value_groups + 1, :, :].reshape( + [self.num_key_value_heads * self.head_dim, self.hidden_size] + ) + self.qkv_proj_weight = self.qkv_proj_weight.permute(3, 0, 1, 2).contiguous() + if module.q_proj.bias is not None: + q_bias = module.q_proj.bias.view(self.num_key_value_heads, self.num_key_value_groups, self.head_dim) + k_bias = module.k_proj.bias.view(self.num_key_value_heads, 1, self.head_dim) + v_bias = module.v_proj.bias.view(self.num_key_value_heads, 1, self.head_dim) + self.qkv_proj_bias = torch.cat([q_bias, k_bias, v_bias], dim=1).view( + [self.num_key_value_heads, self.num_key_value_groups + 2, self.head_dim] + ) + module.q_proj.bias.data = self.qkv_proj_bias[:, : self.num_key_value_groups, :].reshape(-1) + module.k_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups, :].reshape(-1) + module.v_proj.bias.data = self.qkv_proj_bias[:, self.num_key_value_groups + 1, :].reshape(-1) + self.o_proj_weight = module.o_proj.weight.transpose(0, 1).contiguous() + module.o_proj.weight.data = self.o_proj_weight.transpose(0, 1) + self.o_proj_bias = module.o_proj.bias def qkv_gemm(self, hidden_states): raise NotImplementedError("Need to implement in specific model class") @@ -192,16 +310,25 @@ def rope(self, *args, **kwargs): def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): # This ipex op pre-allocates buffers for past_key_values and use beam index history # which to decide which beam should be used to make attention scale dot more 
efficient. - (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( - query, - key, - value, - math.sqrt(self.head_dim), - past_key_value, - kwargs.get("head_mask", None), - attention_mask, - kwargs.get("alibi", None), - ) + if self.module_device == "xpu": + scale = 1.0 / math.sqrt(self.head_dim) + is_causal = False + attn_output = torch.xpu.IpexSDP( + query, key, value, None, attention_mask, None, scale, 1.0, 0.0, is_causal, False + ) + attn_weights = None + past_key_value = (key, value) + else: + (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( + query, + key, + value, + math.sqrt(self.head_dim), + past_key_value, + kwargs.get("head_mask", None), + attention_mask, + kwargs.get("alibi", None), + ) return attn_output, past_key_value, attn_weights def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask, **kwargs): @@ -235,10 +362,18 @@ def forward( qkv_out = self.qkv_gemm(hidden_states) if isinstance(qkv_out, tuple) and len(qkv_out) == 3: query, key, value = self.qkv_gemm(hidden_states) - query, key = self.rope(query, key, kv_seq_len, use_cache, position_ids=position_ids) + query, key = self.rope(query, key, kv_seq_len, use_cache, position_ids, **kwargs) else: query, key, value = self.rope(qkv_out, kv_seq_len, use_cache, past_len=past_len) + if self.module_device == "xpu": + if past_key_value is not None: + key = torch.cat([past_key_value[0].transpose(1, 2), key], dim=1) + value = torch.cat([past_key_value[1].transpose(1, 2), value], dim=1) + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + attention_mask = self.prepare_attention_mask_float(attention_mask, query.dtype) sdpa = self.sdpa_with_cache if use_cache else self.sdpa_without_cache attn_output, past_key_value, attn_weights = sdpa( @@ -251,6 +386,7 @@ def forward( head_mask=kwargs.get("head_mask", None), alibi=kwargs.get("alibi", None), ) + attn_output = self.postprocess_attention_output(attn_output, 
bsz, seq_len) if not output_attentions: @@ -262,9 +398,10 @@ def forward( class _IPEXLlamaAttention(_IPEXAttention): def __init__(self, module, config) -> None: super().__init__(module, config) - if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mha_linear_add = LinearAdd(module.o_proj) - del self.__dict__["_modules"]["o_proj"] + if self.module_device == "cpu": + if module.o_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mha_linear_add = LinearAdd(module.o_proj) + del self.__dict__["_modules"]["o_proj"] def qkv_gemm(self, hidden_states): bsz, seq_len, _ = hidden_states.size() @@ -274,11 +411,16 @@ def qkv_gemm(self, hidden_states): return query, key, value - def rope(self, query, key, kv_seq_len, use_cache, position_ids): - if use_cache: - args = (self.head_dim, self.head_dim // 2, self.head_dim, kv_seq_len) - key = self.ipex_rope(key, position_ids, self.num_key_value_heads, *args) - query = self.ipex_rope(query, position_ids, self.num_heads, *args) + def rope(self, query, key, kv_seq_len, use_cache, position_ids, **kwargs): + if self.module_device == "xpu": + sin = kwargs.pop("sin", None) + cos = kwargs.pop("cos", None) + self.ipex_rope.apply_embedding(query, sin, cos, self.head_dim // 2, key) + else: + if use_cache: + args = (self.head_dim, self.head_dim // 2, self.head_dim, kv_seq_len) + key = self.ipex_rope(key, position_ids, self.num_key_value_heads, *args) + query = self.ipex_rope(query, position_ids, self.num_heads, *args) return query, key # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L341 @@ -372,28 +514,52 @@ def __init__(self, module, config) -> None: super().__init__() _setattr_from_module(self, module) self.config = config - # LinearAllreduce and LinearLayer cannot use fused op LinearAdd - if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: - self.mlp_linear_add = LinearAdd(module.down_proj) - del self.__dict__["_modules"]["down_proj"] 
- self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj) - del self.__dict__["_modules"]["gate_proj"] - del self.__dict__["_modules"]["up_proj"] + self.module_device = next(module.parameters()).device.type + if self.module_device == "xpu": + self.port_parameter(module) + torch.xpu.empty_cache() + else: + # LinearAllreduce and LinearLayer cannot use fused op LinearAdd + if module.down_proj.__class__.__name__ not in ["LinearAllreduce"]: + self.mlp_linear_add = LinearAdd(module.down_proj) + del self.__dict__["_modules"]["down_proj"] + self.linear_silu_mul = Linear2SiluMul(module.gate_proj, module.up_proj) + del self.__dict__["_modules"]["gate_proj"] + del self.__dict__["_modules"]["up_proj"] def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor = None, **kwargs): - if hasattr(self, "linear_silu_mul"): - mlp_gate = self.linear_silu_mul(hidden_states) - if hasattr(self, "mlp_linear_add"): - hidden_states = self.mlp_linear_add(mlp_gate, residual) + if self.module_device == "xpu": + up = torch.ops.torch_ipex.mm_silu(hidden_states, self.gate_proj_weight) + hidden_states = torch.ops.torch_ipex.mm_resmul(hidden_states, self.up_proj_weight, up) + hidden_states = matmul_add_add(hidden_states, self.down_proj_weight, self.down_proj_bias, residual) + else: + if hasattr(self, "linear_silu_mul"): + mlp_gate = self.linear_silu_mul(hidden_states) + if hasattr(self, "mlp_linear_add"): + hidden_states = self.mlp_linear_add(mlp_gate, residual) + else: + # gate_proj and up_proj are deleted once linear_silu_mul is built, + # so feed the fused gate output to down_proj instead of recomputing it + hidden_states = self.down_proj(mlp_gate) + hidden_states = residual + hidden_states else: - hidden_states = self.down_proj(mlp_gate) + hidden_states = self.down_proj( + self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states) + ) hidden_states = residual + hidden_states - else: - hidden_states = self.down_proj(self.act_fn(self.gate_proj(hidden_states)) * self.up_proj(hidden_states)) - hidden_states = residual + 
hidden_states - return hidden_states + def port_parameter(self, module): + self.up_proj_weight = module.up_proj.weight.transpose(0, 1).contiguous() + module.up_proj.weight.data = self.up_proj_weight.transpose(0, 1) + self.gate_proj_weight = module.gate_proj.weight.transpose(0, 1).contiguous() + module.gate_proj.weight.data = self.gate_proj_weight.transpose(0, 1) + self.down_proj_weight = module.down_proj.weight.transpose(0, 1).contiguous() + module.down_proj.weight.data = self.down_proj_weight.transpose(0, 1) + self.up_proj_bias = module.up_proj.bias + self.gate_proj_bias = module.gate_proj.bias + self.down_proj_bias = module.down_proj.bias + class _IPEXFalconMLP(nn.Module): def __init__(self, module, config) -> None: diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 59d4bedb51..57185de6c1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -404,9 +404,9 @@ def _llama_gemma_update_causal_mask_legacy(self, attention_mask, input_tensor, c offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -1966,9 +1966,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" diff --git a/optimum/intel/ipex/modeling_base.py 
b/optimum/intel/ipex/modeling_base.py index d6467f76a2..b2be8a6b1d 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -154,7 +154,7 @@ def __init__( self._device = torch.device("cpu") # CPU only support jit model for now. - if export: + if export and self._device.type == "cpu": if isinstance(model, torch.jit.RecursiveScriptModule): logger.warning("The model has been exported already.") else: @@ -266,7 +266,7 @@ def _from_pretrained( logger.warning("Detect torchscript is false. Convert to torchscript model!") if is_torch_version("<", "2.1.0"): - raise ImportError("`torch>=2.0.0` is needed to trace your model") + raise ImportError("`torch>=2.1.0` is needed to trace your model") task = cls.export_feature config.torch_dtype = torch_dtype @@ -279,6 +279,15 @@ def _from_pretrained( _commit_hash=commit_hash, **model_kwargs, ) + if is_torch_xpu_available(check_device=True): + model.to("xpu:0") + if _is_patched_with_ipex(model, task): + model = _patch_model(model) + else: + use_cache = kwargs.get("use_cache", True) + model = ipex_jit_trace(model, task, use_cache) + config.torchscript = True + config.torch_dtype = torch_dtype return cls(model, config=config, export=True, **kwargs) @@ -504,7 +513,7 @@ def __init__( except AttributeError: self.model_cls = get_model_class(self.config, AutoModelForCausalLM._model_mapping) - if self._is_ipex_exported: + if self._is_ipex_exported and self._device.type == "cpu": self._reorder_cache = _ipex_reorder_cache else: # Check if _reorder_cache is a static method @@ -621,7 +630,7 @@ def forward( inputs["position_ids"] = position_ids if self.use_cache: - if past_key_values is None: + if past_key_values is None and self._device.type == "cpu": past_key_values = self._prepare_past_key_values(input_ids) inputs["past_key_values"] = past_key_values diff --git a/optimum/intel/openvino/modeling_base.py b/optimum/intel/openvino/modeling_base.py index 89da349c82..8944ef6da2 100644 --- 
a/optimum/intel/openvino/modeling_base.py +++ b/optimum/intel/openvino/modeling_base.py @@ -84,9 +84,9 @@ def __init__( for idx, key in enumerate(model.inputs): names = tuple(key.get_names()) input_names[next((name for name in names if "/" not in name), names[0])] = idx - input_dtypes[ - next((name for name in names if "/" not in name), names[0]) - ] = key.get_element_type().get_type_name() + input_dtypes[next((name for name in names if "/" not in name), names[0])] = ( + key.get_element_type().get_type_name() + ) self.input_names = input_names self.input_dtypes = input_dtypes @@ -95,9 +95,9 @@ def __init__( for idx, key in enumerate(model.outputs): names = tuple(key.get_names()) output_names[next((name for name in names if "/" not in name), names[0])] = idx - output_dtypes[ - next((name for name in names if "/" not in name), names[0]) - ] = key.get_element_type().get_type_name() + output_dtypes[next((name for name in names if "/" not in name), names[0])] = ( + key.get_element_type().get_type_name() + ) self.output_names = output_names self.output_dtypes = output_dtypes