add IPEX-XPU support for Llama2 model Inference #703
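This PR extends optimum-intel's IPEX integration so that Llama2 text generation can run on Intel GPUs (the XPU device) in addition to CPU. As a reading aid, here is a minimal usage sketch; it is not code from this PR: the checkpoint name and the explicit .to("xpu") placement are illustrative assumptions, and some optimum-intel releases may additionally require export=True.

import torch
from transformers import AutoTokenizer

from optimum.intel.ipex import IPEXModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf"  # assumption: any Llama2 checkpoint works here
tokenizer = AutoTokenizer.from_pretrained(model_id)
# bfloat16 keeps memory in check; the device move assumes an XPU-enabled IPEX build
model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
model = model.to("xpu")  # assumption: torch.xpu is available in this environment
inputs = tokenizer("Why is sailing fun?", return_tensors="pt").to("xpu")

outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))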

Status: Closed · wants to merge 38 commits

Commits (38)
5c4d13f  add xpu patch to optimum intel (#7) - ganyi1996ppo, Apr 22, 2024
b1d6989  can run but precision error - jiqing-feng, Apr 25, 2024
f2de914  optimize optimum - ganyi1996ppo, Apr 26, 2024
9295457  further optimize - ganyi1996ppo, Apr 26, 2024
c55216a  finalize - faaany, May 8, 2024
5b3b72d  fix version - faaany, May 9, 2024
4897144  fix ipex version check - faaany, May 11, 2024
5351f4a  ipex 2.3 released - jiqing-feng, May 23, 2024
6289b57  change versions - faaany, May 24, 2024
3824300  debug beam search - faaany, May 24, 2024
872a3eb  remove reference elimination - faaany, May 24, 2024
d1d0ca0  refactor IPEXLlamaAttention - faaany, May 25, 2024
3b8900d  Merge branch 'ipex-cpu' into ipex-xpu - faaany, May 26, 2024
815d238  Merge branch 'huggingface:main' into ipex-xpu - faaany, May 26, 2024
89e10d6  add xpu port - faaany, May 26, 2024
9acaba4  Fix llama and gemma modeling patching for openvino export (#714) - echarlaix, May 23, 2024
2f4909c  Fix nncf quantization for decoder models (#727) - echarlaix, May 24, 2024
17d02d3  Merge branch 'ipex-xpu' of https://github.com/faaany/optimum-intel in… - faaany, May 26, 2024
f186ce7  remove - faaany, May 26, 2024
1ff78b2  fix version - faaany, May 26, 2024
ff7f785  bug fix - faaany, May 26, 2024
e3dac89  change module - faaany, May 26, 2024
8725f49  improve device - faaany, May 26, 2024
57cfe11  remove - faaany, May 26, 2024
ee78f95  simplfy rmsnorm - faaany, May 27, 2024
a930f31  Merge branch 'ipex-xpu' of https://github.com/faaany/optimum-intel in… - faaany, May 27, 2024
6098943  style - faaany, May 27, 2024
e0fb06e  fix group attention - faaany, Jun 7, 2024
aa8d395  fix weight shape - faaany, Jun 7, 2024
0a56b19  Merge branch 'main' into ipex-xpu - faaany, Jun 7, 2024
548d83f  fix rebase bug - faaany, Jun 7, 2024
68187e5  revert openvino - faaany, Jun 7, 2024
efedca4  revert openvino - faaany, Jun 7, 2024
bd03552  remove duplicates - faaany, Jun 7, 2024
0d3930a  use the correct black - faaany, Jun 7, 2024
b4ba6d0  Merge branch 'main' into ipex-xpu - faaany, Sep 6, 2024
1fd464b  fix merge conflict - kaixuanliu, Sep 10, 2024
6a52fdf  Merge pull request #1 from kaixuanliu/ipex-xpu - kaixuanliu, Sep 10, 2024
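Several of the commits above ("fix ipex version check", "ipex 2.3 released", "change versions") revolve around gating the XPU code path on the installed IPEX version. A rough sketch of that kind of guard follows; the helper name and the 2.3.0 floor are assumptions for illustration, not the PR's actual code.

import importlib.metadata

import torch
from packaging import version


def is_ipex_xpu_available(min_version: str = "2.3.0") -> bool:
    """Hypothetical helper: True if intel_extension_for_pytorch >= min_version
    is installed and an XPU device is visible."""
    try:
        ipex_version = importlib.metadata.version("intel_extension_for_pytorch")
    except importlib.metadata.PackageNotFoundError:
        return False
    if version.parse(ipex_version) < version.parse(min_version):
        return False
    # torch.xpu only exists once an XPU-capable backend is importable
    return hasattr(torch, "xpu") and torch.xpu.is_available()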
1 change: 1 addition & 0 deletions notebooks/ipex/text_generation.ipynb
@@ -22,6 +22,7 @@
 "source": [
 "import torch\n",
 "from transformers import AutoTokenizer\n",
+"\n",
 "from optimum.intel.ipex import IPEXModelForCausalLM"
 ]
 },
49 changes: 37 additions & 12 deletions notebooks/openvino/optimum_openvino_inference.ipynb
@@ -78,6 +78,7 @@
 "source": [
 "from optimum.intel import OVModelForQuestionAnswering\n",
 "\n",
+"\n",
 "# Load PyTorch model from the Hub and export to OpenVINO in the background\n",
 "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad\", export=True)\n",
 "\n",
@@ -122,6 +123,7 @@
 "source": [
 "from transformers import AutoTokenizer\n",
 "\n",
+"\n",
 "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n",
 "tokenizer.save_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")"
 ]
@@ -182,9 +184,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForQuestionAnswering\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForQuestionAnswering\n",
+"\n",
+"\n",
 "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n",
 "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad\")\n",
 "ov_pipe = pipeline(\"question-answering\", model=model, tokenizer=tokenizer)\n",
@@ -240,9 +244,11 @@
 ],
 "source": [
 "import torch\n",
-"from optimum.intel import OVModelForQuestionAnswering\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForQuestionAnswering\n",
+"\n",
+"\n",
 "model = OVModelForQuestionAnswering.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n",
 "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased-distilled-squad-ov-fp32\")\n",
 "\n",
@@ -324,9 +330,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForQuestionAnswering\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForQuestionAnswering\n",
+"\n",
+"\n",
 "model = OVModelForQuestionAnswering.from_pretrained(\n",
 " \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\", compile=False\n",
 ")\n",
@@ -411,6 +419,7 @@
 "source": [
 "from openvino.runtime import Core\n",
 "\n",
+"\n",
 "for device in Core().available_devices:\n",
 " print(device, Core().get_property(device, \"FULL_DEVICE_NAME\"))"
 ]
@@ -528,10 +537,12 @@
 }
 ],
 "source": [
+"from datasets import load_dataset\n",
 "from IPython.display import Audio\n",
-"from optimum.intel import OVModelForAudioClassification\n",
 "from transformers import AutoFeatureExtractor, pipeline\n",
-"from datasets import load_dataset\n",
 "\n",
+"from optimum.intel import OVModelForAudioClassification\n",
+"\n",
+"\n",
 "model_id = \"helenai/MIT-ast-finetuned-speech-commands-v2-ov\"\n",
 "model = OVModelForAudioClassification.from_pretrained(model_id)\n",
@@ -638,9 +649,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForCausalLM\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForCausalLM\n",
+"\n",
+"\n",
 "model_id = \"helenai/gpt2-ov\"\n",
 "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
 "model = OVModelForCausalLM.from_pretrained(model_id)\n",
@@ -704,9 +717,11 @@
 ],
 "source": [
 "from IPython.display import Image\n",
-"from optimum.intel import OVModelForImageClassification\n",
 "from transformers import AutoImageProcessor, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForImageClassification\n",
+"\n",
+"\n",
 "model_id = \"helenai/microsoft-swin-tiny-patch4-window7-224-ov\"\n",
 "model = OVModelForImageClassification.from_pretrained(model_id, compile=False)\n",
 "image_processor = AutoImageProcessor.from_pretrained(model_id)\n",
@@ -766,9 +781,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForMaskedLM\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForMaskedLM\n",
+"\n",
+"\n",
 "model_id = \"helenai/bert-base-uncased-ov\"\n",
 "model = OVModelForMaskedLM.from_pretrained(model_id)\n",
 "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
@@ -835,9 +852,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForQuestionAnswering\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForQuestionAnswering\n",
+"\n",
+"\n",
 "# Load the model and tokenizer saved in Part 1 of this notebook. Or use the line below to load them from the hub\n",
 "# model_id = \"helenai/distilbert-base-uncased-distilled-squad-ov-fp32\"\n",
 "model_id = \"distilbert-base-uncased-distilled-squad-ov-fp32\"\n",
@@ -890,9 +909,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForSeq2SeqLM\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForSeq2SeqLM\n",
+"\n",
+"\n",
 "model_id = \"helenai/t5-small-ov\"\n",
 "model = OVModelForSeq2SeqLM.from_pretrained(model_id, compile=False, trust_remote_code=True)\n",
 "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
@@ -998,9 +1019,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForSequenceClassification\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForSequenceClassification\n",
+"\n",
+"\n",
 "model_id = \"helenai/papluca-xlm-roberta-base-language-detection-ov\"\n",
 "model = OVModelForSequenceClassification.from_pretrained(model_id)\n",
 "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
@@ -1047,9 +1070,11 @@
 }
 ],
 "source": [
-"from optimum.intel import OVModelForTokenClassification\n",
 "from transformers import AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVModelForTokenClassification\n",
+"\n",
+"\n",
 "model_id = \"helenai/dslim-bert-base-NER-ov-fp32\"\n",
 "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
 "model = OVModelForTokenClassification.from_pretrained(model_id)\n",
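Every hunk in this notebook is the same mechanical import cleanup: third-party imports (transformers, datasets, IPython, openvino) are grouped first, the optimum.intel imports are moved into their own block, and the two groups are separated by two blank lines, matching the isort/black style used elsewhere in the repository.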
27 changes: 16 additions & 11 deletions notebooks/openvino/quantized_generation_demo.ipynb
@@ -45,6 +45,7 @@
 "import os\n",
 "\n",
 "from transformers import AutoTokenizer\n",
+"\n",
 "from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig"
 ]
 },
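The import touched above brings in OVWeightQuantizationConfig alongside OVModelForCausalLM; for reference, the two are typically combined as below. This is an illustrative sketch, not code from the diff; the checkpoint name and bit-width are assumptions.

from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig

# Export a causal LM to OpenVINO with 4-bit weight-only quantization applied on the fly
quant_config = OVWeightQuantizationConfig(bits=4)
model = OVModelForCausalLM.from_pretrained(
    "gpt2",                      # assumption: any Hub causal-LM checkpoint
    export=True,
    quantization_config=quant_config,
)
model.save_pretrained("gpt2-ov-int4")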
@@ -211,6 +212,7 @@
 "source": [
 "from transformers import TextStreamer\n",
 "\n",
+"\n",
 "# Tokenize the sample\n",
 "inputs = tokenizer([sample], return_tensors='pt')\n",
 "\n",
@@ -294,15 +296,15 @@
 "\n",
 "\n",
 "# Tokenize the sample\n",
-"inputs = tokenizer([sample], return_tensors='pt') \n",
+"inputs = tokenizer([sample], return_tensors='pt')\n",
 "\n",
 "out = stateless_model.generate(\n",
 " **inputs,\n",
 " max_new_tokens=128,\n",
 " streamer=TextStreamer(tokenizer=tokenizer, skip_special_tokens=True),\n",
 " pad_token_id=tokenizer.eos_token_id,\n",
 " prompt_lookup_num_tokens=3,\n",
-") "
+")"
 ]
 },
 {
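The generate() call in this hunk uses prompt_lookup_num_tokens, which enables prompt-lookup decoding: a draft-model-free variant of assisted generation in transformers that proposes candidate tokens by matching n-grams already present in the prompt, then validates them in a single forward pass.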
@@ -442,6 +444,7 @@
 "outputs": [],
 "source": [
 "from functools import wraps\n",
+"\n",
 "import numpy as np\n",
 "\n",
 "\n",
@@ -458,15 +461,15 @@
 " if len(self.seq_lens) > 0 or len(self.win_sizes) > 0:\n",
 " raise RuntimeError(\"Always use a new instance, don't reuse!\")\n",
 " self.model_forward = self.model.forward\n",
-" \n",
+"\n",
 " @wraps(self.model_forward)\n",
 " def forward_wrapper(**kwargs):\n",
 " self.seq_lens[-1].append(kwargs.get(\"attention_mask\").shape[-1])\n",
 " self.win_sizes[-1].append(kwargs.get(\"input_ids\").shape[-1] - 1)\n",
 " return self.model_forward(**kwargs)\n",
-" \n",
+"\n",
 " self.model.forward = forward_wrapper\n",
-" \n",
+"\n",
 " # wrap generate method\n",
 " self.model_generate = self.model.generate\n",
 "\n",
@@ -494,7 +497,7 @@
 " self.seq_lens = [sl[1:] for sl in self.seq_lens]\n",
 " # Add window size for output to ease calculation later\n",
 " for ws, sl in zip(self.win_sizes, self.seq_lens):\n",
-" ws.append(0) \n",
+" ws.append(0)\n",
 "\n",
 " def acceptance_rate(self, return_mean=True, normalize=False):\n",
 " # ar_per_win = ((cur_seq_len - cur_win_size) - (prev_seq_len - prev_win_size) - 1) / prev_win_size\n",
@@ -533,8 +536,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"from tqdm import tqdm\n",
 "from datasets import load_dataset\n",
+"from tqdm import tqdm\n",
+"\n",
 "\n",
 "dataset_name = \"openai_humaneval\"\n",
 "dataset_subset_name = None\n",
@@ -590,10 +594,10 @@
 "from threading import Thread\n",
 "\n",
 "from transformers import (\n",
-" TextIteratorStreamer,\n",
+" GenerationConfig,\n",
 " StoppingCriteria,\n",
 " StoppingCriteriaList,\n",
-" GenerationConfig,\n",
+" TextIteratorStreamer,\n",
 ")\n",
 "\n",
 "\n",
@@ -690,7 +694,7 @@
 " prompt_char = \"▌\"\n",
 " history[-1][1] = prompt_char\n",
 " yield history, \"Status: Generating...\", *([gr.update(interactive=False)] * 4)\n",
-" \n",
+"\n",
 " streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)\n",
 "\n",
 " # Create a stopping criteria to prevent the model from playing the role of the user aswell.\n",
@@ -770,6 +774,7 @@
 "source": [
 "import gradio as gr\n",
 "\n",
+"\n",
 "try:\n",
 " demo.close()\n",
 "except:\n",
@@ -808,7 +813,7 @@
 " history: conversation history\n",
 " Returns:\n",
 " updated history\n",
-" \"\"\" \n",
+" \"\"\"\n",
 " history[-1][1] = None\n",
 " return history\n",
 "\n",
4 changes: 3 additions & 1 deletion notebooks/openvino/question_answering_quantization.ipynb
@@ -51,9 +51,11 @@
 "import transformers\n",
 "from evaluate import evaluator\n",
 "from openvino.runtime import Core\n",
-"from optimum.intel import OVModelForQuestionAnswering, OVQuantizer, OVQuantizationConfig, OVConfig\n",
 "from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline\n",
 "\n",
+"from optimum.intel import OVConfig, OVModelForQuestionAnswering, OVQuantizationConfig, OVQuantizer\n",
+"\n",
+"\n",
 "transformers.logging.set_verbosity_error()\n",
 "datasets.logging.set_verbosity_error()"
 ]
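The reordered import above pulls in the full static-quantization toolchain. A condensed sketch of how these classes fit together; the sample count, preprocessing, and paths are assumptions for illustration, and the notebook itself does this in more detail:

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

from optimum.intel import OVConfig, OVModelForQuestionAnswering, OVQuantizationConfig, OVQuantizer

model_id = "distilbert-base-uncased-distilled-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

quantizer = OVQuantizer.from_pretrained(model)
# Build a small calibration set from SQuAD (assumption: 50 samples suffice for a demo)
calibration_dataset = quantizer.get_calibration_dataset(
    "squad",
    num_samples=50,
    dataset_split="train",
    preprocess_function=lambda ex: tokenizer(ex["question"], ex["context"], truncation=True, max_length=384),
)
quantizer.quantize(
    ov_config=OVConfig(quantization_config=OVQuantizationConfig()),
    calibration_dataset=calibration_dataset,
    save_directory="distilbert-squad-int8-ov",
)
int8_model = OVModelForQuestionAnswering.from_pretrained("distilbert-squad-int8-ov")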
@@ -46,15 +46,18 @@
 "outputs": [],
 "source": [
 "import time\n",
+"from pathlib import Path\n",
+"\n",
 "import datasets\n",
 "import matplotlib.pyplot as plt\n",
 "import numpy as np\n",
 "import transformers\n",
-"from pathlib import Path\n",
 "from openvino.runtime import Core\n",
+"\n",
 "from optimum.intel import OVConfig, OVQuantizer, OVStableDiffusionPipeline, OVWeightQuantizationConfig\n",
 "from optimum.intel.openvino.configuration import OVQuantizationMethod\n",
 "\n",
+"\n",
 "transformers.logging.set_verbosity_error()\n",
 "datasets.logging.set_verbosity_error()"
 ]
4 changes: 2 additions & 2 deletions optimum/exporters/ipex/model_patcher.py
@@ -29,11 +29,11 @@
 from .modeling_utils import (
     _IPEX_MINIMUM_VERSION_FOR_PATCHING,
     _gpt2_block_forward,
-    _ipex_rms_layer_norm_forward,
     _IPEXFalconDecoderLayer,
     _IPEXGPT2Attention,
     _IPEXIntermediate,
     _IPEXLlamaDecoderLayer,
+    _llama_layer_norm_forward,
     _llama_model_forward,
 )

@@ -79,7 +79,7 @@ def _patch_llama_model(model):
     2. Linear fusion with (2 Linears + Silu + Mul) and (Linear + Add)
     """
     convert_functions(model, LlamaModel, "forward", _llama_model_forward)
-    convert_functions(model, LlamaRMSNorm, "forward", _ipex_rms_layer_norm_forward)
+    convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward)
     convert_class(model, LlamaDecoderLayer, _IPEXLlamaDecoderLayer, model.config)
     return model
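The hunk above changes which forward implementation convert_functions binds onto every LlamaRMSNorm module. For readers unfamiliar with the patcher, here is a minimal sketch of the pattern such a helper follows; it illustrates the technique, not the file's exact code:

import torch.nn as nn


def convert_functions(module: nn.Module, target_class: type, method_name: str, new_function) -> None:
    """Recursively rebind `method_name` on every submodule that is an
    instance of `target_class`. Sketch of the patching pattern only."""
    for child in module.children():
        convert_functions(child, target_class, method_name, new_function)
    if isinstance(module, target_class):
        # __get__ turns the plain function into a method bound to this module
        setattr(module, method_name, new_function.__get__(module, module.__class__))

With that shape in mind, convert_functions(model, LlamaRMSNorm, "forward", _llama_layer_norm_forward) rewires every RMSNorm in the model to the IPEX-optimized kernel while leaving the module's parameters untouched.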