From 4871bbad25c1962f2d2a2ff9c4bb5643b51fca6f Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 11 Mar 2024 12:24:52 -0400 Subject: [PATCH 01/14] add ipex readme --- README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/README.md b/README.md index 7905cefded..ed4b60d1b6 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,34 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour +## IPEX +Below are examples of how to use IPEX model to generate texts. +### generate +```diff +import torch +from transformers import AutoTokenizer, AutoConfig +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex import IPEXModelForCausalLM + +config = AutoConfig.from_pretrained("gpt2") +model = IPEXModelForCausalLM.from_pretrained( + "gpt2", + config=config, + torch_dtype=torch.bfloat16, + export=True, +) +tokenizer = AutoTokenizer.from_pretrained("gpt2") +input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] +model_inputs = tokenizer(input_sentence, return_tensors="pt") +generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) + +generated_ids = model.generate(**model_inputs, **generation_kwargs) +output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(output) +``` + +For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). + ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: From fe01c5704bed5771f62598ebb466d51ddf3a4011 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 20 Mar 2024 10:31:53 +0800 Subject: [PATCH 02/14] Update README.md Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed4b60d1b6..142ea55d1e 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,8 @@ from transformers import AutoTokenizer, AutoConfig + from optimum.intel.ipex import IPEXModelForCausalLM config = AutoConfig.from_pretrained("gpt2") -model = IPEXModelForCausalLM.from_pretrained( +- model = AutoModelForCausalLM.from_pretrained( ++ model = IPEXModelForCausalLM.from_pretrained( "gpt2", config=config, torch_dtype=torch.bfloat16, From 8a49f9c42e8ae9efa54fa70fd722e768cc30b39c Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 20 Mar 2024 06:32:13 -0400 Subject: [PATCH 03/14] fix readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 142ea55d1e..a61f4afb84 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour ## IPEX -Below are examples of how to use IPEX model to generate texts. +Here is the example of how to use IPEX optimized model to generate texts. 
### generate ```diff import torch From 7570d715d1cf773e15d6a0d17d100d27e010a10e Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 20 Mar 2024 09:24:41 -0400 Subject: [PATCH 04/14] fix readme --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index a61f4afb84..ff70c50169 100644 --- a/README.md +++ b/README.md @@ -49,15 +49,13 @@ Here is the example of how to use IPEX optimized model to generate texts. ### generate ```diff import torch -from transformers import AutoTokenizer, AutoConfig +from transformers import AutoTokenizer - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM -config = AutoConfig.from_pretrained("gpt2") - model = AutoModelForCausalLM.from_pretrained( + model = IPEXModelForCausalLM.from_pretrained( "gpt2", - config=config, torch_dtype=torch.bfloat16, export=True, ) From 0d312f10a991b64179c9cd1285c0df1685f3f03d Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:03:53 +0800 Subject: [PATCH 05/14] Update README.md Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ff70c50169..e9838f2d1d 100644 --- a/README.md +++ b/README.md @@ -53,12 +53,10 @@ from transformers import AutoTokenizer - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM -- model = AutoModelForCausalLM.from_pretrained( -+ model = IPEXModelForCausalLM.from_pretrained( - "gpt2", - torch_dtype=torch.bfloat16, - export=True, -) + + model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) tokenizer = AutoTokenizer.from_pretrained("gpt2") input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] model_inputs = tokenizer(input_sentence, return_tensors="pt") From 1a59c1bf3501cb994dca45db11f170245a86fa84 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 21 Mar 2024 10:20:26 -0400 Subject: [PATCH 06/14] fix ipex readme --- README.md | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index e9838f2d1d..e824b867ae 100644 --- a/README.md +++ b/README.md @@ -44,30 +44,6 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour -## IPEX -Here is the example of how to use IPEX optimized model to generate texts. -### generate -```diff -import torch -from transformers import AutoTokenizer -- from transformers import AutoModelForCausalLM -+ from optimum.intel.ipex import IPEXModelForCausalLM - - - model_id = "gpt2" -- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) -+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) -tokenizer = AutoTokenizer.from_pretrained("gpt2") -input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. 
Can you write a whole Haiku in a single tweet?"] -model_inputs = tokenizer(input_sentence, return_tensors="pt") -generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) - -generated_ids = model.generate(**model_inputs, **generation_kwargs) -output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -print(output) -``` - -For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). ## Neural Compressor @@ -227,6 +203,33 @@ Quantization aware training (QAT) is applied in order to simulate the effects of You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/index). +## IPEX +With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusioin and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. +Here is the example of how to use IPEX optimized model to generate texts. +### generate +```diff +import torch +from transformers import AutoTokenizer +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex import IPEXModelForCausalLM + + + model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) +tokenizer = AutoTokenizer.from_pretrained("gpt2") +input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] +model_inputs = tokenizer(input_sentence, return_tensors="pt") +generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) + +generated_ids = model.generate(**model_inputs, **generation_kwargs) +output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(output) +``` + +For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). + + ## Running the examples Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. From 398870fb6091a2f179fd93f0c5521ec554633ed5 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Fri, 22 Mar 2024 08:48:15 +0800 Subject: [PATCH 07/14] Update README.md Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e824b867ae..8db0ca45bc 100644 --- a/README.md +++ b/README.md @@ -204,7 +204,7 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ## IPEX -With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusioin and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. +With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. 
It will also apply linear fusioin and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. +With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusion and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. Here is the example of how to use IPEX optimized model to generate texts. ### generate ```diff From 026cab4dcc13858185dc7d488d513c53e3db3305 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:44:32 +0100 Subject: [PATCH 08/14] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8db0ca45bc..e8b0db8b2c 100644 --- a/README.md +++ b/README.md @@ -208,8 +208,8 @@ With `export=True`, IPEX model will replace torch linear to ipex linear which pr Here is the example of how to use IPEX optimized model to generate texts. ### generate ```diff -import torch -from transformers import AutoTokenizer + import torch + from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM From 9302421bc0829995871696a02e87f7ca37547c7f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:44:39 +0100 Subject: [PATCH 09/14] Update README.md --- README.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e8b0db8b2c..36812b7e33 100644 --- a/README.md +++ b/README.md @@ -217,14 +217,10 @@ Here is the example of how to use IPEX optimized model to generate texts. model_id = "gpt2" - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) + model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) -tokenizer = AutoTokenizer.from_pretrained("gpt2") -input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] -model_inputs = tokenizer(input_sentence, return_tensors="pt") -generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) - -generated_ids = model.generate(**model_inputs, **generation_kwargs) -output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -print(output) + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + results = pipe("He's a dreadful magician and") + ``` For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). From 863f5ce156e8339a48a9098741d941c1b149f314 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:45:16 +0100 Subject: [PATCH 10/14] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 36812b7e33..97587a9b0f 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,6 @@ With `export=True`, IPEX model will replace torch linear to ipex linear which pr Here is the example of how to use IPEX optimized model to generate texts. 
### generate ```diff - import torch from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM From 6772655fd92575e04bbe1050cabc19fb3ccca35e Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:45:29 +0100 Subject: [PATCH 11/14] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 97587a9b0f..ec3c1183fd 100644 --- a/README.md +++ b/README.md @@ -206,7 +206,6 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ## IPEX With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusion and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. Here is the example of how to use IPEX optimized model to generate texts. -### generate ```diff from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM From 5037688ca714fd58b833412aab2e10afb2a184d8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 12:11:10 +0100 Subject: [PATCH 12/14] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ec3c1183fd..e7ec0d0e13 100644 --- a/README.md +++ b/README.md @@ -204,8 +204,7 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ## IPEX -With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusion and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. -Here is the example of how to use IPEX optimized model to generate texts. +To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model. 
```diff from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM From 2bebcf2cf808124fde71b188c738f2ffffda25f7 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 12:12:08 +0100 Subject: [PATCH 13/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e7ec0d0e13..1927a5f9af 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,7 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with ```diff from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM -+ from optimum.intel.ipex import IPEXModelForCausalLM ++ from optimum.intel import IPEXModelForCausalLM model_id = "gpt2" From 5fb90baf32952ca1c89b924b722c6aa11f1c933c Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 12:12:36 +0100 Subject: [PATCH 14/14] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 1927a5f9af..c29a923745 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,6 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour - ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface:
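For readers who want the end state without applying the series, the snippet below assembles the README example that the later patches converge on. It is a sketch reconstructed from the diffs above, not part of the patch series itself: the `import torch` line (removed from the README snippet in patch 10) is kept here so that `torch.bfloat16` resolves, and the final `print` call is added purely for illustration.

```python
# Sketch of the final README example, assembled from the diffs in patches 08-13 above.
# Assumes optimum-intel is installed with the `ipex` extra mentioned at the top of the series.
import torch  # restored so torch.bfloat16 below resolves; patch 10 drops this import from the README snippet
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForCausalLM  # import path per patch 13

model_id = "gpt2"
# export=True loads the PyTorch checkpoint, exports the model via TorchScript and applies
# IPEX operator and graph-level optimizations (see the description added in patch 12).
model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")
print(results)  # not in the README diff; added so the sketch produces visible output
```

As the description introduced in patch 12 puts it, the only change relative to a plain transformers pipeline is swapping the `AutoModelForXxx` class for the corresponding `IPEXModelForXxx` class.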