From 4871bbad25c1962f2d2a2ff9c4bb5643b51fca6f Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 11 Mar 2024 12:24:52 -0400 Subject: [PATCH 01/14] add ipex readme --- README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/README.md b/README.md index 7905cefded..ed4b60d1b6 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,34 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour +## IPEX +Below are examples of how to use IPEX model to generate texts. +### generate +```diff +import torch +from transformers import AutoTokenizer, AutoConfig +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex import IPEXModelForCausalLM + +config = AutoConfig.from_pretrained("gpt2") +model = IPEXModelForCausalLM.from_pretrained( + "gpt2", + config=config, + torch_dtype=torch.bfloat16, + export=True, +) +tokenizer = AutoTokenizer.from_pretrained("gpt2") +input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] +model_inputs = tokenizer(input_sentence, return_tensors="pt") +generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) + +generated_ids = model.generate(**model_inputs, **generation_kwargs) +output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(output) +``` + +For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). + ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface: From fe01c5704bed5771f62598ebb466d51ddf3a4011 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 20 Mar 2024 10:31:53 +0800 Subject: [PATCH 02/14] Update README.md Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed4b60d1b6..142ea55d1e 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,8 @@ from transformers import AutoTokenizer, AutoConfig + from optimum.intel.ipex import IPEXModelForCausalLM config = AutoConfig.from_pretrained("gpt2") -model = IPEXModelForCausalLM.from_pretrained( +- model = AutoModelForCausalLM.from_pretrained( ++ model = IPEXModelForCausalLM.from_pretrained( "gpt2", config=config, torch_dtype=torch.bfloat16, From 8a49f9c42e8ae9efa54fa70fd722e768cc30b39c Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 20 Mar 2024 06:32:13 -0400 Subject: [PATCH 03/14] fix readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 142ea55d1e..a61f4afb84 100644 --- a/README.md +++ b/README.md @@ -45,7 +45,7 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour ## IPEX -Below are examples of how to use IPEX model to generate texts. +Here is the example of how to use IPEX optimized model to generate texts. 
### generate ```diff import torch From 7570d715d1cf773e15d6a0d17d100d27e010a10e Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 20 Mar 2024 09:24:41 -0400 Subject: [PATCH 04/14] fix readme --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index a61f4afb84..ff70c50169 100644 --- a/README.md +++ b/README.md @@ -49,15 +49,13 @@ Here is the example of how to use IPEX optimized model to generate texts. ### generate ```diff import torch -from transformers import AutoTokenizer, AutoConfig +from transformers import AutoTokenizer - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM -config = AutoConfig.from_pretrained("gpt2") - model = AutoModelForCausalLM.from_pretrained( + model = IPEXModelForCausalLM.from_pretrained( "gpt2", - config=config, torch_dtype=torch.bfloat16, export=True, ) From 0d312f10a991b64179c9cd1285c0df1685f3f03d Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Thu, 21 Mar 2024 14:03:53 +0800 Subject: [PATCH 05/14] Update README.md Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ff70c50169..e9838f2d1d 100644 --- a/README.md +++ b/README.md @@ -53,12 +53,10 @@ from transformers import AutoTokenizer - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM -- model = AutoModelForCausalLM.from_pretrained( -+ model = IPEXModelForCausalLM.from_pretrained( - "gpt2", - torch_dtype=torch.bfloat16, - export=True, -) + + model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) tokenizer = AutoTokenizer.from_pretrained("gpt2") input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] model_inputs = tokenizer(input_sentence, return_tensors="pt") From 1a59c1bf3501cb994dca45db11f170245a86fa84 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 21 Mar 2024 10:20:26 -0400 Subject: [PATCH 06/14] fix ipex readme --- README.md | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index e9838f2d1d..e824b867ae 100644 --- a/README.md +++ b/README.md @@ -44,30 +44,6 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour -## IPEX -Here is the example of how to use IPEX optimized model to generate texts. -### generate -```diff -import torch -from transformers import AutoTokenizer -- from transformers import AutoModelForCausalLM -+ from optimum.intel.ipex import IPEXModelForCausalLM - - - model_id = "gpt2" -- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) -+ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) -tokenizer = AutoTokenizer.from_pretrained("gpt2") -input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. 
Can you write a whole Haiku in a single tweet?"] -model_inputs = tokenizer(input_sentence, return_tensors="pt") -generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) - -generated_ids = model.generate(**model_inputs, **generation_kwargs) -output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -print(output) -``` - -For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). ## Neural Compressor @@ -227,6 +203,33 @@ Quantization aware training (QAT) is applied in order to simulate the effects of You can find more examples in the [documentation](https://huggingface.co/docs/optimum/intel/index). +## IPEX +With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusioin and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. +Here is the example of how to use IPEX optimized model to generate texts. +### generate +```diff +import torch +from transformers import AutoTokenizer +- from transformers import AutoModelForCausalLM ++ from optimum.intel.ipex import IPEXModelForCausalLM + + + model_id = "gpt2" +- model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) ++ model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) +tokenizer = AutoTokenizer.from_pretrained("gpt2") +input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] +model_inputs = tokenizer(input_sentence, return_tensors="pt") +generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) + +generated_ids = model.generate(**model_inputs, **generation_kwargs) +output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(output) +``` + +For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). + + ## Running the examples Check out the [`examples`](https://github.com/huggingface/optimum-intel/tree/main/examples) directory to see how 🤗 Optimum Intel can be used to optimize models and accelerate inference. From 398870fb6091a2f179fd93f0c5521ec554633ed5 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Fri, 22 Mar 2024 08:48:15 +0800 Subject: [PATCH 07/14] Update README.md Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e824b867ae..8db0ca45bc 100644 --- a/README.md +++ b/README.md @@ -204,7 +204,7 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ## IPEX -With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusioin and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. +With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. 
It will also apply linear fusioin and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. +With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusion and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. Here is the example of how to use IPEX optimized model to generate texts. ### generate ```diff From 026cab4dcc13858185dc7d488d513c53e3db3305 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:44:32 +0100 Subject: [PATCH 08/14] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8db0ca45bc..e8b0db8b2c 100644 --- a/README.md +++ b/README.md @@ -208,8 +208,8 @@ With `export=True`, IPEX model will replace torch linear to ipex linear which pr Here is the example of how to use IPEX optimized model to generate texts. ### generate ```diff -import torch -from transformers import AutoTokenizer + import torch + from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM From 9302421bc0829995871696a02e87f7ca37547c7f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:44:39 +0100 Subject: [PATCH 09/14] Update README.md --- README.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e8b0db8b2c..36812b7e33 100644 --- a/README.md +++ b/README.md @@ -217,14 +217,10 @@ Here is the example of how to use IPEX optimized model to generate texts. model_id = "gpt2" - model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16) + model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True) -tokenizer = AutoTokenizer.from_pretrained("gpt2") -input_sentence = ["Answer the following yes/no question by reasoning step-by-step please. Can you write a whole Haiku in a single tweet?"] -model_inputs = tokenizer(input_sentence, return_tensors="pt") -generation_kwargs = dict(max_new_tokens=32, do_sample=False, num_beams=4, num_beam_groups=1, no_repeat_ngram_size=2, use_cache=True) - -generated_ids = model.generate(**model_inputs, **generation_kwargs) -output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] -print(output) + tokenizer = AutoTokenizer.from_pretrained(model_id) + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + results = pipe("He's a dreadful magician and") + ``` For more details, please refer to the [documentation](https://intel.github.io/intel-extension-for-pytorch/#introduction). From 863f5ce156e8339a48a9098741d941c1b149f314 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:45:16 +0100 Subject: [PATCH 10/14] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 36812b7e33..97587a9b0f 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,6 @@ With `export=True`, IPEX model will replace torch linear to ipex linear which pr Here is the example of how to use IPEX optimized model to generate texts. 
### generate ```diff - import torch from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM + from optimum.intel.ipex import IPEXModelForCausalLM From 6772655fd92575e04bbe1050cabc19fb3ccca35e Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 11:45:29 +0100 Subject: [PATCH 11/14] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 97587a9b0f..ec3c1183fd 100644 --- a/README.md +++ b/README.md @@ -206,7 +206,6 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ## IPEX With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusion and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. Here is the example of how to use IPEX optimized model to generate texts. -### generate ```diff from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM From 5037688ca714fd58b833412aab2e10afb2a184d8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 12:11:10 +0100 Subject: [PATCH 12/14] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ec3c1183fd..e7ec0d0e13 100644 --- a/README.md +++ b/README.md @@ -204,8 +204,7 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ## IPEX -With `export=True`, IPEX model will replace torch linear to ipex linear which prepacks the weights. It will also apply linear fusion and [IAKV](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#indirect-access-kv-cache) for generation. Finally, jit.trace will be applied to change the model to graph mode. -Here is the example of how to use IPEX optimized model to generate texts. +To load your IPEX model, you can just replace your `AutoModelForXxx` class with the corresponding `IPEXModelForXxx` class. You can set `export=True` to load a PyTorch checkpoint, export your model via TorchScript and apply IPEX optimizations : both operators optimization (replaced with customized IPEX operators) and graph-level optimization (like operators fusion) will be applied on your model. 
```diff from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM From 2bebcf2cf808124fde71b188c738f2ffffda25f7 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 12:12:08 +0100 Subject: [PATCH 13/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e7ec0d0e13..1927a5f9af 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,7 @@ To load your IPEX model, you can just replace your `AutoModelForXxx` class with ```diff from transformers import AutoTokenizer, pipeline - from transformers import AutoModelForCausalLM -+ from optimum.intel.ipex import IPEXModelForCausalLM ++ from optimum.intel import IPEXModelForCausalLM model_id = "gpt2" From 5fb90baf32952ca1c89b924b722c6aa11f1c933c Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 22 Mar 2024 12:12:36 +0100 Subject: [PATCH 14/14] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 1927a5f9af..c29a923745 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,6 @@ where `extras` can be one or more of `ipex`, `neural-compressor`, `openvino`, `n # Quick tour - ## Neural Compressor Dynamic quantization can be used through the Optimum command-line interface:
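For readers who want the end state without applying the series, the snippet below assembles the README example that the later patches converge on. It is a sketch reconstructed from the diffs above, not part of the patch series itself: the `import torch` line (removed from the README snippet in patch 10) is kept here so that `torch.bfloat16` resolves, and the final `print` call is added purely for illustration.

```python
# Sketch of the final README example, assembled from the diffs in patches 08-13 above.
# Assumes optimum-intel is installed with the `ipex` extra mentioned at the top of the series.
import torch  # restored so torch.bfloat16 below resolves; patch 10 drops this import from the README snippet
from transformers import AutoTokenizer, pipeline
from optimum.intel import IPEXModelForCausalLM  # import path per patch 13

model_id = "gpt2"
# export=True loads the PyTorch checkpoint, exports the model via TorchScript and applies
# IPEX operator and graph-level optimizations (see the description added in patch 12).
model = IPEXModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
results = pipe("He's a dreadful magician and")
print(results)  # not in the README diff; added so the sketch produces visible output
```

As the description introduced in patch 12 puts it, the only change relative to a plain transformers pipeline is swapping the `AutoModelForXxx` class for the corresponding `IPEXModelForXxx` class.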