From 5351f4a1d411b6813e8ed47a826ffdc667491518 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 23 May 2024 13:05:07 -0400 Subject: [PATCH 01/49] ipex 2.3 released --- optimum/exporters/ipex/model_patcher.py | 8 ++++---- optimum/exporters/ipex/modeling_utils.py | 17 ++++++++++------- optimum/intel/ipex/modeling_base.py | 2 +- tests/ipex/test_modeling.py | 16 ---------------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 60ff3b721b..3996c2b23f 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -62,10 +62,10 @@ def patch_op(m, target_m, new_op_name, new_op): def _patch_llama_model(model): - if is_ipex_version("<", "2.5.0"): - raise ImportError("Only ipex version > 2.3.0 supports RotaryEmbedding and IndirectAccessKVCache") + if is_ipex_version("<", "2.3.0"): + raise ImportError("Only ipex version >= 2.3.0 supports RotaryEmbedding and IndirectAccessKVCacheAttention") - from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCache, RotaryEmbedding + from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, RotaryEmbedding ipex_rope = RotaryEmbedding( model.config.max_position_embeddings, @@ -73,7 +73,7 @@ def _patch_llama_model(model): model.config.rope_theta, model.config.architectures[0], ) - ipex_scale_dot_product = IndirectAccessKVCache(text_max_length=model.config.max_position_embeddings) + ipex_scale_dot_product = IndirectAccessKVCacheAttention(text_max_length=model.config.max_position_embeddings) patch_op(model, LlamaAttention, "ipex_rope", ipex_rope) patch_op(model, LlamaAttention, "ipex_scale_dot_product", ipex_scale_dot_product) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index f75e559eaf..9d53caf4fc 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -219,7 +219,7 @@ def _llama_model_forward( # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 class _IPEXLlamaDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): - if is_ipex_version("<", "2.5.0"): + if is_ipex_version("<", "2.3.0"): raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd @@ -278,7 +278,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, ) - if not self.distributed: + if hasattr(self, "mha_linear_add"): hidden_states = self.mha_linear_add(hidden_states, residual) else: hidden_states = self.self_attn.o_proj(hidden_states) @@ -288,12 +288,15 @@ def forward( residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) - mlp_gate = self.linear_silu_mul(hidden_states) - - if not self.distributed: - hidden_states = self.mlp_linear_add(mlp_gate, residual) + if hasattr(self, "linear_silu_mul"): + mlp_gate = self.linear_silu_mul(hidden_states) + if hasattr(self, "mlp_linear_add"): + hidden_states = self.mlp_linear_add(mlp_gate, residual) + else: + hidden_states = self.mlp.down_proj(mlp_gate) + hidden_states = residual + hidden_states else: - hidden_states = self.mlp.down_proj(mlp_gate) + hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states outputs = (hidden_states,) diff --git a/optimum/intel/ipex/modeling_base.py 
b/optimum/intel/ipex/modeling_base.py index e929a4ddb8..c9d43e3dc0 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -63,7 +63,7 @@ def _is_patched_with_ipex(model, task): - if is_ipex_version("<", "2.5.0"): + if is_ipex_version("<", "2.3.0"): return False if isinstance(model, torch.jit.ScriptModule): diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 2a2f18f6f8..7eb34ef47c 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -219,22 +219,6 @@ def test_pipeline(self, model_arch): self.assertEqual(pipe.device, model.device) self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_assisted_decoding(self, model_arch): - model_id = MODEL_NAMES[model_arch] - tokenizer = AutoTokenizer.from_pretrained(model_id) - ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) - transformers_model = AutoModelForCausalLM.from_pretrained(model_id) - tokens = tokenizer("This is a sample input", return_tensors="pt") - ipex_output = ipex_model.generate(**tokens, do_sample=False) - ipex_output_assisted = ipex_model.generate(**tokens, do_sample=False, assistant_model=transformers_model) - transformers_output = transformers_model.generate(**tokens, do_sample=False) - transformers_output_assisted = transformers_model.generate( - **tokens, do_sample=False, assistant_model=ipex_model - ) - self.assertTrue(torch.equal(ipex_output, ipex_output_assisted)) - self.assertTrue(torch.equal(transformers_output, transformers_output_assisted)) - @parameterized.expand( grid_parameters( { From 1f98d6d773cce4157e00a941b594db0be97696fd Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 27 May 2024 11:07:16 -0400 Subject: [PATCH 02/49] skip tests --- tests/ipex/test_modeling.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 7eb34ef47c..2948d383f0 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -219,6 +219,23 @@ def test_pipeline(self, model_arch): self.assertEqual(pipe.device, model.device) self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @unittest.skip("CPU IPEXModel does not support assisted decoding for now.") + def test_assisted_decoding(self, model_arch): + model_id = MODEL_NAMES[model_arch] + tokenizer = AutoTokenizer.from_pretrained(model_id) + ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) + transformers_model = AutoModelForCausalLM.from_pretrained(model_id) + tokens = tokenizer("This is a sample input", return_tensors="pt") + ipex_output = ipex_model.generate(**tokens, do_sample=False) + ipex_output_assisted = ipex_model.generate(**tokens, do_sample=False, assistant_model=transformers_model) + transformers_output = transformers_model.generate(**tokens, do_sample=False) + transformers_output_assisted = transformers_model.generate( + **tokens, do_sample=False, assistant_model=ipex_model + ) + self.assertTrue(torch.equal(ipex_output, ipex_output_assisted)) + self.assertTrue(torch.equal(transformers_output, transformers_output_assisted)) + @parameterized.expand( grid_parameters( { From b2b93bb12b71a9bd0e6e4f3ed96261d05a664b2d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 27 May 2024 11:35:25 -0400 Subject: [PATCH 03/49] skip testing without pkv --- tests/ipex/test_modeling.py | 3 ++- 1 file 
changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 2948d383f0..043d1e761c 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -244,7 +244,7 @@ def test_assisted_decoding(self, model_arch): } ) ) - @unittest.skipIf(is_ipex_version("<", "2.5.0"), reason="Only ipex version > 2.3.0 supports ipex model patching") + @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version >= 2.3.0 supports ipex model patching") def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): model_id = MODEL_NAMES[model_arch] set_seed(SEED) @@ -271,6 +271,7 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): self.assertIsInstance(outputs, torch.Tensor) self.assertEqual(outputs, transformers_outputs) + @unittest.skip("CPU IPEXModel only supports with past_key_values.") def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) From 64dcde4f6d42e1c14a797580760147abcb4eee8f Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 27 May 2024 12:11:59 -0400 Subject: [PATCH 04/49] add tests skip --- tests/ipex/test_modeling.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 043d1e761c..5646d59db1 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -220,7 +220,7 @@ def test_pipeline(self, model_arch): self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skip("CPU IPEXModel does not support assisted decoding for now.") + @unittest.skipIf(is_ipex_version(">=", "2.3.0"), reason="CPU IPEXModel does not support assisted decoding when ipex version >= 2.3.0") def test_assisted_decoding(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -236,20 +236,12 @@ def test_assisted_decoding(self, model_arch): self.assertTrue(torch.equal(ipex_output, ipex_output_assisted)) self.assertTrue(torch.equal(transformers_output, transformers_output_assisted)) - @parameterized.expand( - grid_parameters( - { - "model_arch": IPEX_PATCHED_SUPPORTED_ARCHITECTURES, - "use_cache": [True, False], - } - ) - ) + @parameterized.expand(IPEX_PATCHED_SUPPORTED_ARCHITECTURES) @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version >= 2.3.0 supports ipex model patching") - def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): + def test_ipex_patching_beam_search(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) - model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache) - self.assertEqual(model.use_cache, use_cache) + model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) trasnformers_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) tokenizer.pad_token = tokenizer.eos_token @@ -260,7 +252,7 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): GenerationConfig(max_new_tokens=4, num_beams=4, do_sample=True), GenerationConfig(max_new_tokens=4, num_beams=8, do_sample=True), GenerationConfig(max_new_tokens=4, num_beams=32, do_sample=True), - GenerationConfig(max_new_tokens=4, do_sample=not use_cache, top_p=1.0, top_k=5, penalty_alpha=0.6), + 
GenerationConfig(max_new_tokens=4, do_sample=True, top_p=1.0, top_k=5, penalty_alpha=0.6), GenerationConfig(max_new_tokens=4, do_sample=True, top_p=0.9, top_k=0), ) for text in texts: @@ -271,7 +263,7 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): self.assertIsInstance(outputs, torch.Tensor) self.assertEqual(outputs, transformers_outputs) - @unittest.skip("CPU IPEXModel only supports with past_key_values.") + @unittest.skipIf(is_ipex_version(">=", "2.3.0"), reason="CPU IPEXModel only supports with past_key_values for ipex version >= 2.3.0") def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) From 945f6b6a958ad560fe32bf33b43b3bd4ec113625 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 27 May 2024 12:26:58 -0400 Subject: [PATCH 05/49] only llama2 with at least 64 head size support IAKV --- tests/ipex/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 5646d59db1..428e5d3da9 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -178,7 +178,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "mpt", "opt", ) - IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama",) + IPEX_PATCHED_SUPPORTED_ARCHITECTURES = ("llama2",) GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.0 From c8922f3fd3c51fcea2e2e26b3234dc400ee39c38 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 27 May 2024 12:55:57 -0400 Subject: [PATCH 06/49] cannot assert same outputs cause do_sample=True --- tests/ipex/test_modeling.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 428e5d3da9..975821eaf2 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -259,9 +259,7 @@ def test_ipex_patching_beam_search(self, model_arch): tokens = tokenizer(text, padding=True, return_tensors="pt") for generation_config in generation_configs: outputs = model.generate(**tokens, generation_config=generation_config) - transformers_outputs = trasnformers_model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) - self.assertEqual(outputs, transformers_outputs) @unittest.skipIf(is_ipex_version(">=", "2.3.0"), reason="CPU IPEXModel only supports with past_key_values for ipex version >= 2.3.0") def test_compare_with_and_without_past_key_values(self): From 2ddfa7a679d96f10d1f8e05ea57f791bed75129a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 27 May 2024 13:25:23 -0400 Subject: [PATCH 07/49] rm tiny-llama model testing cause it not work for IAKV --- tests/ipex/test_modeling.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 975821eaf2..fddcb548eb 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -171,7 +171,6 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "gpt2", "gpt_neo", "gpt_neox", - "llama", "llama2", "mistral", # "phi", @@ -242,7 +241,6 @@ def test_ipex_patching_beam_search(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) - trasnformers_model = AutoModelForCausalLM.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) tokenizer.pad_token = tokenizer.eos_token # Test with batch_size is 1 and 2. 
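[Editorial sketch, not part of the patch series] Patches 05-07 above shrink the IAKV-patched test matrix because the tiny random llama checkpoint does not work with IndirectAccessKVCacheAttention: per the commit messages, the op needs a head/hidden size of at least 64, and a later patch in this series gates patching on config.hidden_size >= 64. The snippet below is a minimal illustration of that eligibility check, assuming only transformers is installed; the helper name and the use of AutoConfig are illustrative assumptions, not code from these patches.

    from transformers import AutoConfig

    def eligible_for_ipex_patching(model_id: str, task: str = "text-generation") -> bool:
        # Mirrors the gate added later in this series: only llama checkpoints
        # for text-generation with hidden_size >= 64 receive the
        # RotaryEmbedding / IndirectAccessKVCacheAttention patches; smaller
        # configs fall back to the plain (unpatched) TorchScript path.
        config = AutoConfig.from_pretrained(model_id)
        return (
            config.model_type == "llama"
            and task == "text-generation"
            and getattr(config, "hidden_size", 0) >= 64
        )

A standard Llama-2 config (hidden_size=4096) passes this check, while the tiny random test checkpoints do not; later patches in the series re-add the tiny llama model to the unpatched test list once the hidden-size gate is in place.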
From f4e887d4089804c851260da3ab90545736c69ce0 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 28 May 2024 05:31:32 -0400 Subject: [PATCH 08/49] fix code style --- tests/ipex/test_modeling.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index fddcb548eb..aebebda101 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -219,7 +219,10 @@ def test_pipeline(self, model_arch): self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) @parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf(is_ipex_version(">=", "2.3.0"), reason="CPU IPEXModel does not support assisted decoding when ipex version >= 2.3.0") + @unittest.skipIf( + is_ipex_version(">=", "2.3.0"), + reason="CPU IPEXModel does not support assisted decoding when ipex version >= 2.3.0", + ) def test_assisted_decoding(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -259,7 +262,10 @@ def test_ipex_patching_beam_search(self, model_arch): outputs = model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) - @unittest.skipIf(is_ipex_version(">=", "2.3.0"), reason="CPU IPEXModel only supports with past_key_values for ipex version >= 2.3.0") + @unittest.skipIf( + is_ipex_version(">=", "2.3.0"), + reason="CPU IPEXModel only supports with past_key_values for ipex version >= 2.3.0", + ) def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) From d96ea583afd4b1f9ce55301d6c542769ed821dc4 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 28 May 2024 05:32:10 -0400 Subject: [PATCH 09/49] fix style --- optimum/exporters/openvino/model_patcher.py | 12 ++++++------ optimum/intel/openvino/modeling_decoder.py | 8 +++++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 93a8430522..a0b20e4905 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -349,9 +349,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" @@ -1524,9 +1524,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[ - : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] - ] = mask_slice + causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( + mask_slice + ) if ( self.config._attn_implementation == "sdpa" diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 933d92a502..2c02c16d2b 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -560,9 +560,11 @@ def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_ke ): past_key_values 
= tuple( tuple( - past_state[indicies] - if not self.config.model_type == "chatglm" - else past_state[:, indicies, ...] + ( + past_state[indicies] + if not self.config.model_type == "chatglm" + else past_state[:, indicies, ...] + ) for past_state in layer_past ) for layer_past in past_key_values From ec24d5aec8fe752df4e487ad4962cdbc7b8d241a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 28 May 2024 13:48:03 -0400 Subject: [PATCH 10/49] rm tiny llama on test pipeline --- tests/ipex/test_pipelines.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index c4ae471a0f..e571d5a6a1 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -56,7 +56,6 @@ class PipelinesIntegrationTest(unittest.TestCase): "gpt2", "gpt_neo", "gpt_neox", - "llama", "llama2", "mistral", "mpt", From 871de7b612a2488cf0518f6b259574e42eeedbb4 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 30 May 2024 05:50:47 -0400 Subject: [PATCH 11/49] fix tests --- tests/ipex/test_modeling.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index aebebda101..7d38b5dc34 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -171,7 +171,6 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "gpt2", "gpt_neo", "gpt_neox", - "llama2", "mistral", # "phi", "mpt", @@ -181,7 +180,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.0 - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(SUPPORTED_ARCHITECTURES + IPEX_PATCHED_SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) @@ -206,7 +205,7 @@ def test_compare_to_transformers(self, model_arch): # Compare tensor outputs self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(SUPPORTED_ARCHITECTURES + IPEX_PATCHED_SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -218,11 +217,8 @@ def test_pipeline(self, model_arch): self.assertEqual(pipe.device, model.device) self.assertTrue(all("This is a sample" in item["generated_text"] for item in outputs)) + # High optimized model llama is not supported assisted decoding for now. 
@parameterized.expand(SUPPORTED_ARCHITECTURES) - @unittest.skipIf( - is_ipex_version(">=", "2.3.0"), - reason="CPU IPEXModel does not support assisted decoding when ipex version >= 2.3.0", - ) def test_assisted_decoding(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -262,10 +258,6 @@ def test_ipex_patching_beam_search(self, model_arch): outputs = model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) - @unittest.skipIf( - is_ipex_version(">=", "2.3.0"), - reason="CPU IPEXModel only supports with past_key_values for ipex version >= 2.3.0", - ) def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" tokenizer = AutoTokenizer.from_pretrained(model_id) From d0c8951fd32bbaa3b8ef9827161a101964067b62 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 30 May 2024 08:37:36 -0400 Subject: [PATCH 12/49] support use_cache=False --- optimum/exporters/ipex/modeling_utils.py | 42 +++++++++++---------- optimum/exporters/openvino/model_patcher.py | 12 +++--- optimum/intel/ipex/modeling_base.py | 4 ++ tests/ipex/test_modeling.py | 17 ++++++--- 4 files changed, 44 insertions(+), 31 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 9d53caf4fc..f1bf09bc4e 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -19,7 +19,7 @@ from torch import nn from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask from transformers.modeling_outputs import BaseModelOutputWithPast -from transformers.models.llama.modeling_llama import repeat_kv +from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv from optimum.intel.utils.import_utils import is_ipex_version @@ -51,27 +51,27 @@ def _llama_attn_forward( query = query.view(bsz, q_len, self.num_heads, self.head_dim) key = key.view(bsz, q_len, self.num_key_value_heads, self.head_dim) value = value.view(bsz, q_len, self.num_key_value_heads, self.head_dim) - # Use ipex op to rotary position embedding more efficient. - key = self.ipex_rope( - key, - position_ids, - self.num_key_value_heads, - self.head_dim, - self.head_dim // 2, - self.head_dim, - kv_seq_len, - ) - query = self.ipex_rope( - query, - position_ids, - self.num_heads, - self.head_dim, - self.head_dim // 2, - self.head_dim, - kv_seq_len, - ) if use_cache: + # Use ipex op to rotary position embedding more efficient. + key = self.ipex_rope( + key, + position_ids, + self.num_key_value_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + query = self.ipex_rope( + query, + position_ids, + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) # This ipex op pre-allocates buffers for past_key_values and use beam index history # which to decide which beam should be used to make attention scale dot more efficient. 
(attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product( @@ -87,6 +87,8 @@ def _llama_attn_forward( value_states = value.transpose(1, 2) query_states = query.transpose(1, 2) key_states = key.transpose(1, 2) + cos, sin = self.rotary_emb(value_states, position_ids) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) kv_seq_len = key_states.shape[-2] past_key_value = None diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index a0b20e4905..93a8430522 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -349,9 +349,9 @@ def _llama_gemma_update_causal_mask(self, attention_mask, input_tensor, cache_po offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" @@ -1524,9 +1524,9 @@ def _dbrx_update_causal_mask_legacy( offset = 0 mask_shape = attention_mask.shape mask_slice = (attention_mask.eq(0.0)).to(dtype=dtype) * min_dtype - causal_mask[: mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3]] = ( - mask_slice - ) + causal_mask[ + : mask_shape[0], : mask_shape[1], offset : mask_shape[2] + offset, : mask_shape[3] + ] = mask_slice if ( self.config._attn_implementation == "sdpa" diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index c9d43e3dc0..fa3893ee06 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -83,7 +83,10 @@ def ipex_jit_trace(model, task, use_cache): if _is_patched_with_ipex(model, task): model = _patch_model(model) + # Todo: integerate in prepare_jit_inputs. sample_inputs = get_dummy_input(model, return_dict=True) + if not use_cache: + sample_inputs.pop("past_key_values") # Use Tensor Processing Primitives to accelerate linear, see https://arxiv.org/abs/2104.05755. 
_enable_tpp() else: @@ -193,6 +196,7 @@ def _from_transformers( "torch_dtype": torch_dtype, "trust_remote_code": trust_remote_code, "_commit_hash": _commit_hash, + "use_cache": use_cache, } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 7d38b5dc34..c2c418f45d 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -234,12 +234,20 @@ def test_assisted_decoding(self, model_arch): self.assertTrue(torch.equal(ipex_output, ipex_output_assisted)) self.assertTrue(torch.equal(transformers_output, transformers_output_assisted)) - @parameterized.expand(IPEX_PATCHED_SUPPORTED_ARCHITECTURES) - @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version >= 2.3.0 supports ipex model patching") - def test_ipex_patching_beam_search(self, model_arch): + @parameterized.expand( + grid_parameters( + { + "model_arch": IPEX_PATCHED_SUPPORTED_ARCHITECTURES, + "use_cache": [True, False], + } + ) + ) + @unittest.skipIf(is_ipex_version("<", "2.3.0"), reason="Only ipex version > 2.3.0 supports ipex model patching") + def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): model_id = MODEL_NAMES[model_arch] set_seed(SEED) - model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) + model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache) + self.assertEqual(model.use_cache, use_cache) tokenizer = AutoTokenizer.from_pretrained(model_id) tokenizer.pad_token = tokenizer.eos_token # Test with batch_size is 1 and 2. @@ -249,7 +257,6 @@ def test_ipex_patching_beam_search(self, model_arch): GenerationConfig(max_new_tokens=4, num_beams=4, do_sample=True), GenerationConfig(max_new_tokens=4, num_beams=8, do_sample=True), GenerationConfig(max_new_tokens=4, num_beams=32, do_sample=True), - GenerationConfig(max_new_tokens=4, do_sample=True, top_p=1.0, top_k=5, penalty_alpha=0.6), GenerationConfig(max_new_tokens=4, do_sample=True, top_p=0.9, top_k=0), ) for text in texts: From 537f0aaea302f2be5fec6c0b261c1f60d1c1aa00 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 30 May 2024 09:38:15 -0400 Subject: [PATCH 13/49] rm use_cache in model_kwargs --- optimum/intel/ipex/modeling_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index fa3893ee06..b6eef13af9 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -196,7 +196,6 @@ def _from_transformers( "torch_dtype": torch_dtype, "trust_remote_code": trust_remote_code, "_commit_hash": _commit_hash, - "use_cache": use_cache, } model = TasksManager.get_model_from_task(task, model_id, **model_kwargs) From 5a7179011f6a4e78d92afbe669325e43fe549b23 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 30 May 2024 10:14:14 -0400 Subject: [PATCH 14/49] set use_cache --- optimum/intel/ipex/modeling_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index b6eef13af9..af94410deb 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -98,6 +98,8 @@ def ipex_jit_trace(model, task, use_cache): if "past_key_values" in sample_inputs and use_cache: # Make sure the model will output past_key_values in generation tasks model.config.use_cache = True + else: + model.config.use_cache = False model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable 
repack while jit tracing to reduce the memory From bde814e4892771d630da4e649562ab6ad2683819 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Fri, 31 May 2024 08:50:09 +0800 Subject: [PATCH 15/49] Update optimum/intel/ipex/modeling_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/ipex/modeling_base.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index af94410deb..8a06a2307c 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -95,11 +95,7 @@ def ipex_jit_trace(model, task, use_cache): model.config.return_dict = False - if "past_key_values" in sample_inputs and use_cache: - # Make sure the model will output past_key_values in generation tasks - model.config.use_cache = True - else: - model.config.use_cache = False + model.config.use_cache = past_key_values" in sample_inputs model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory From 4a81ea95e09cc35bbcd3415c749ec23592e13028 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 31 May 2024 04:51:25 -0400 Subject: [PATCH 16/49] fix spelling error --- optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 8a06a2307c..0d9ab0924f 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -95,7 +95,7 @@ def ipex_jit_trace(model, task, use_cache): model.config.return_dict = False - model.config.use_cache = past_key_values" in sample_inputs + model.config.use_cache = "past_key_values" in sample_inputs model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory From 3a61e845d633c76ef52652ffacf03ebfd3800861 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 31 May 2024 04:55:38 -0400 Subject: [PATCH 17/49] fix style --- optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 0d9ab0924f..b8988f2db0 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -95,7 +95,7 @@ def ipex_jit_trace(model, task, use_cache): model.config.return_dict = False - model.config.use_cache = "past_key_values" in sample_inputs + model.config.use_cache = "past_key_values" in sample_inputs model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory From fd694074425c819111f4d9a65bfd524e0a9cb2a2 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 31 May 2024 05:05:42 -0400 Subject: [PATCH 18/49] add transformers version warning --- optimum/exporters/ipex/modeling_utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index f1bf09bc4e..3553062834 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import math from typing import List, Optional, Tuple, Union @@ -21,7 +22,10 @@ from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv -from optimum.intel.utils.import_utils import is_ipex_version +from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version + + +logger = logging.getLogger(__name__) # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 @@ -223,6 +227,10 @@ class _IPEXLlamaDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): if is_ipex_version("<", "2.3.0"): raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") + if is_transformers_version("<", "4.38.2") or is_transformers_version(">", "4.41.2"): + logger.warning( + "The verified transformers version is 4.38.2 ~ 4.41.2. It may cause unexpected error if out of this interval" + ) from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd From 1032a261ee1f802c63f5f711d62fc4ba0ac2d1ea Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 31 May 2024 05:22:27 -0400 Subject: [PATCH 19/49] add compare resultes --- optimum/intel/openvino/modeling_decoder.py | 8 +++----- tests/ipex/test_modeling.py | 13 ++++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/optimum/intel/openvino/modeling_decoder.py b/optimum/intel/openvino/modeling_decoder.py index 2c02c16d2b..933d92a502 100644 --- a/optimum/intel/openvino/modeling_decoder.py +++ b/optimum/intel/openvino/modeling_decoder.py @@ -560,11 +560,9 @@ def _expand_outputs_for_generation(self, indicies, logits: torch.Tensor, past_ke ): past_key_values = tuple( tuple( - ( - past_state[indicies] - if not self.config.model_type == "chatglm" - else past_state[:, indicies, ...] - ) + past_state[indicies] + if not self.config.model_type == "chatglm" + else past_state[:, indicies, ...] for past_state in layer_past ) for layer_past in past_key_values diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index c2c418f45d..ff764a4ace 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -247,23 +247,26 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): model_id = MODEL_NAMES[model_arch] set_seed(SEED) model = IPEXModelForCausalLM.from_pretrained(model_id, export=True, use_cache=use_cache) + trasnformers_model = AutoModelForCausalLM.from_pretrained(model_id) self.assertEqual(model.use_cache, use_cache) tokenizer = AutoTokenizer.from_pretrained(model_id) tokenizer.pad_token = tokenizer.eos_token # Test with batch_size is 1 and 2. 
texts = ["This is a sample", ["This is the first input", "This is the second input"]] generation_configs = ( - GenerationConfig(max_new_tokens=4, num_beams=2, do_sample=True), - GenerationConfig(max_new_tokens=4, num_beams=4, do_sample=True), - GenerationConfig(max_new_tokens=4, num_beams=8, do_sample=True), - GenerationConfig(max_new_tokens=4, num_beams=32, do_sample=True), - GenerationConfig(max_new_tokens=4, do_sample=True, top_p=0.9, top_k=0), + GenerationConfig(max_new_tokens=4, num_beams=2, do_sample=False), + GenerationConfig(max_new_tokens=4, num_beams=4, do_sample=False), + GenerationConfig(max_new_tokens=4, num_beams=8, do_sample=False), + GenerationConfig(max_new_tokens=4, num_beams=32, do_sample=False), + GenerationConfig(max_new_tokens=4, do_sample=False, top_p=0.9, top_k=0), ) for text in texts: tokens = tokenizer(text, padding=True, return_tensors="pt") for generation_config in generation_configs: outputs = model.generate(**tokens, generation_config=generation_config) + transformers_outputs = trasnformers_model.generate(**tokens, generation_config=generation_config) self.assertIsInstance(outputs, torch.Tensor) + self.assertTrue(torch.equal(outputs, transformers_outputs)) def test_compare_with_and_without_past_key_values(self): model_id = "echarlaix/tiny-random-gpt2-torchscript" From c8e79698384ec441cb53219ca7be67040797670a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 31 May 2024 05:27:11 -0400 Subject: [PATCH 20/49] add warning --- optimum/exporters/ipex/model_patcher.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 3996c2b23f..b1fe891b59 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging + from transformers.models.llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, @@ -30,6 +32,8 @@ ) +logger = logging.getLogger(__name__) + _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) @@ -65,6 +69,10 @@ def _patch_llama_model(model): if is_ipex_version("<", "2.3.0"): raise ImportError("Only ipex version >= 2.3.0 supports RotaryEmbedding and IndirectAccessKVCacheAttention") + logger.warning( + "Only greedy search and beam search with do_sample=True/False are verified for the patched model. 
It may have risk if other generation methods are applied" + ) + from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, RotaryEmbedding ipex_rope = RotaryEmbedding( From afdc8d7ce7b83e543766277ba86ae3b2443207ad Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Fri, 31 May 2024 05:35:31 -0400 Subject: [PATCH 21/49] set pad_token_id --- tests/ipex/test_modeling.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index ff764a4ace..7ac63b94ee 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -258,7 +258,9 @@ def test_ipex_patching_beam_search(self, test_name, model_arch, use_cache): GenerationConfig(max_new_tokens=4, num_beams=4, do_sample=False), GenerationConfig(max_new_tokens=4, num_beams=8, do_sample=False), GenerationConfig(max_new_tokens=4, num_beams=32, do_sample=False), - GenerationConfig(max_new_tokens=4, do_sample=False, top_p=0.9, top_k=0), + GenerationConfig( + max_new_tokens=4, do_sample=False, top_p=0.9, top_k=0, pad_token_id=tokenizer.eos_token_id + ), ) for text in texts: tokens = tokenizer(text, padding=True, return_tensors="pt") From 1d1df34dade8646038262eac0a21c2640d1660c5 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 3 Jun 2024 06:30:24 -0400 Subject: [PATCH 22/49] limited transformers --- optimum/exporters/ipex/modeling_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 3553062834..d6b79828d8 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -228,9 +228,7 @@ def __init__(self, module, config, distributed=False): if is_ipex_version("<", "2.3.0"): raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") if is_transformers_version("<", "4.38.2") or is_transformers_version(">", "4.41.2"): - logger.warning( - "The verified transformers version is 4.38.2 ~ 4.41.2. 
It may cause unexpected error if out of this interval" - ) + raise ImportError("Only transformers versions 4.38.2 ~ 4.41.2 are verified.") from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd From aaaa4c360e008a13c648537951db5928c5a2dbba Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 3 Jun 2024 12:31:46 -0400 Subject: [PATCH 23/49] fix transformers version --- optimum/exporters/ipex/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index d6b79828d8..de41e914f3 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -227,7 +227,7 @@ class _IPEXLlamaDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): if is_ipex_version("<", "2.3.0"): raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") - if is_transformers_version("<", "4.38.2") or is_transformers_version(">", "4.41.2"): + if is_transformers_version("<", "4.38.2") or is_transformers_version(">=", "4.39.0"): raise ImportError("Only transformers versions 4.38.2 ~ 4.41.2 are verified.") from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd From f6b8010425be42e1bd001c8ca26eb35a48c4ced0 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 4 Jun 2024 12:15:19 -0400 Subject: [PATCH 24/49] update transformers version --- optimum/exporters/ipex/modeling_utils.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index de41e914f3..55afc3abf1 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -227,7 +227,7 @@ class _IPEXLlamaDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): if is_ipex_version("<", "2.3.0"): raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") - if is_transformers_version("<", "4.38.2") or is_transformers_version(">=", "4.39.0"): + if is_transformers_version("<", "4.38.2") or is_transformers_version(">=", "4.41.2"): raise ImportError("Only transformers versions 4.38.2 ~ 4.41.2 are verified.") from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd diff --git a/setup.py b/setup.py index 251ec61cdd..134fe0d238 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.10.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<4.39.0"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<=4.41.2"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From 51e47b654881c42ed40af1a9db868e91376e827b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 4 Jun 2024 12:38:39 -0400 Subject: [PATCH 25/49] fix version --- optimum/exporters/ipex/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 55afc3abf1..d6b79828d8 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -227,7 +227,7 @@ class _IPEXLlamaDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): if is_ipex_version("<", 
"2.3.0"): raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") - if is_transformers_version("<", "4.38.2") or is_transformers_version(">=", "4.41.2"): + if is_transformers_version("<", "4.38.2") or is_transformers_version(">", "4.41.2"): raise ImportError("Only transformers versions 4.38.2 ~ 4.41.2 are verified.") from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd From 5204b2471cb309ab04fce5c327f0c6c3397200e3 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 4 Jun 2024 14:02:50 -0400 Subject: [PATCH 26/49] temporary fix for multi-query model --- optimum/intel/ipex/modeling_base.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index b8988f2db0..69015e6bbd 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -522,6 +522,23 @@ def _prepare_past_key_values(self, input_ids): past_key_values = tuple(tuple(pkv for _ in range(nb_pkv)) for _ in range(num_layers)) return past_key_values + + # Temporary fix. + def _get_initial_cache_position(self, input_ids, model_kwargs): + """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length""" + if not model_kwargs.get("use_cache", True): + model_kwargs["cache_position"] = None + return model_kwargs + + past_length = 0 + if "past_key_values" in model_kwargs: + past_length = model_kwargs["past_key_values"][0][0].shape[-2] + if "inputs_embeds" in model_kwargs: + cur_len = model_kwargs["inputs_embeds"].shape[1] + else: + cur_len = input_ids.shape[-1] + model_kwargs["cache_position"] = torch.arange(past_length, cur_len, device=input_ids.device) + return model_kwargs def forward( self, From 8f2f02513d7ac014877fb72d8b6467f49b46f7a5 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 4 Jun 2024 14:13:13 -0400 Subject: [PATCH 27/49] fix code styke --- optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 69015e6bbd..fa4b050f22 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -522,7 +522,7 @@ def _prepare_past_key_values(self, input_ids): past_key_values = tuple(tuple(pkv for _ in range(nb_pkv)) for _ in range(num_layers)) return past_key_values - + # Temporary fix. 
def _get_initial_cache_position(self, input_ids, model_kwargs): """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length""" From 8dc5ad51f0b680d707b84d948f6cdc6486868fe7 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 4 Jun 2024 14:16:39 -0400 Subject: [PATCH 28/49] add transformers version tests --- .github/workflows/test_ipex.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 8e02bd5510..dcc1fd0048 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -18,6 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] + transformers-version: [4.38.2, 4.41.2] os: [ubuntu-latest] runs-on: ${{ matrix.os }} From d366b80b6e359b69a10af2bce90541e9661f4119 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 07:01:55 -0400 Subject: [PATCH 29/49] check geenration method --- optimum/exporters/ipex/model_patcher.py | 1 + optimum/intel/ipex/modeling_base.py | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index b1fe891b59..84fade5fc8 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -36,6 +36,7 @@ _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) +_IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search") def convert_func(m, func_name, new_function): diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index fa4b050f22..3420b3358c 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -18,7 +18,7 @@ import warnings from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union import intel_extension_for_pytorch as ipex import torch @@ -50,7 +50,7 @@ from optimum.modeling_base import OptimizedModel from optimum.utils import NormalizedConfigManager -from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _patch_model +from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_GENERATION_METHODS, _IPEX_EXPORTED_TASK, _patch_model from ..generation.modeling import prepare_jit_inputs from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask, recursive_to_device @@ -523,7 +523,7 @@ def _prepare_past_key_values(self, input_ids): return past_key_values - # Temporary fix. + # Temporary fix, will delete when https://github.com/huggingface/transformers/pull/31226 release. 
def _get_initial_cache_position(self, input_ids, model_kwargs): """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length""" if not model_kwargs.get("use_cache", True): @@ -579,6 +579,18 @@ def forward( return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values) + def _prepare_generation_config( + self, generation_config: Optional[GenerationConfig], **kwargs: Dict + ) -> Tuple[GenerationConfig, Dict]: + generation_config, model_kwargs = self.model_cls._prepare_generation_config(self, generation_config, **kwargs) + generation_method = generation_config.get_generation_mode(kwargs.get("assistant_model", None)).value + if self._is_ipex_exported and generation_method not in _IPEX_EXPORTED_GENERATION_METHODS: + raise ValueError( + f"The generation method {generation_method} is not supported for patched models for now, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}" + ) + + return generation_config, model_kwargs + def _prepare_inputs_for_generation_for_llama( input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs From 3948cad1907389e62fa82b7e44b370df8ae63ddc Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 5 Jun 2024 11:10:14 +0800 Subject: [PATCH 30/49] Update optimum/intel/ipex/modeling_base.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/intel/ipex/modeling_base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 3420b3358c..f49ec27684 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -95,7 +95,8 @@ def ipex_jit_trace(model, task, use_cache): model.config.return_dict = False - model.config.use_cache = "past_key_values" in sample_inputs + if "past_key_values" in sample_inputs: + model.config.use_cache = True model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory From e482e586256a271c1abef1b93ca9a0e935ed3920 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 5 Jun 2024 11:30:03 +0800 Subject: [PATCH 31/49] Update .github/workflows/test_ipex.yml Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index dcc1fd0048..779c54be1b 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - transformers-version: [4.38.2, 4.41.2] + transformers-version: [4.38.0, 4.41.2] os: [ubuntu-latest] runs-on: ${{ matrix.os }} From d1b63ef4c7007279ed307539e1001874dd18e540 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 07:57:56 -0400 Subject: [PATCH 32/49] fix use_cache --- optimum/intel/ipex/modeling_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index f49ec27684..b723122799 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -85,8 +85,6 @@ def ipex_jit_trace(model, task, use_cache): model = _patch_model(model) # Todo: integerate in prepare_jit_inputs. 
sample_inputs = get_dummy_input(model, return_dict=True) - if not use_cache: - sample_inputs.pop("past_key_values") # Use Tensor Processing Primitives to accelerate linear, see https://arxiv.org/abs/2104.05755. _enable_tpp() else: @@ -96,7 +94,9 @@ def ipex_jit_trace(model, task, use_cache): model.config.return_dict = False if "past_key_values" in sample_inputs: - model.config.use_cache = True + model.config.use_cache = use_cache + if not use_cache: + sample_inputs.pop("past_key_values") model = ipex.optimize(model.eval(), dtype=model.dtype, inplace=True) # Disable repack while jit tracing to reduce the memory From ea4d3e28b8ecb48f8f47de494922bcb7f8b13170 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 10:13:17 -0400 Subject: [PATCH 33/49] add hidden size limitation for patch --- optimum/intel/ipex/modeling_base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index b723122799..2f26b250da 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -73,6 +73,9 @@ def _is_patched_with_ipex(model, task): return True return False else: + # The ipex IAKV op in patched model requires the hidden size at least 64 + if model.config.hidden_size < 64: + return False return model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES and task in _IPEX_EXPORTED_TASK From bcb2b5aacd956d9191f66f37d9f40a9bcce6c0de Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 10:15:02 -0400 Subject: [PATCH 34/49] add llama in tests --- tests/ipex/test_modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 7ac63b94ee..0d0ff16855 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -172,6 +172,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "gpt_neo", "gpt_neox", "mistral", + "llama", # "phi", "mpt", "opt", From f5f1af82e8d4deea0bee8c1131c0ddcb3ca89276 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 10:57:08 -0400 Subject: [PATCH 35/49] add re-load tests --- tests/ipex/test_modeling.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 0d0ff16855..74088cfe54 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -14,6 +14,7 @@ # ruff: noqa +import tempfile import time import unittest @@ -87,10 +88,16 @@ def test_compare_to_transformers(self, model_arch): with torch.no_grad(): transformers_outputs = transformers_model(**tokens) outputs = ipex_model(**tokens) + + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model_outputs = loaded_model(**tokens) # Compare tensor outputs for output_name in {"logits", "last_hidden_state"}: if output_name in transformers_outputs: self.assertTrue(torch.allclose(outputs[output_name], transformers_outputs[output_name], atol=1e-4)) + self.assertTrue(torch.equal(outputs[output_name], loaded_model_outputs[output_name])) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -139,11 +146,19 @@ def test_compare_to_transformers(self, model_arch): with torch.no_grad(): transformers_outputs = transformers_model(**tokens) outputs = ipex_model(**tokens) + + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = 
self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model_outputs = loaded_model(**tokens) + self.assertIn("start_logits", outputs) self.assertIn("end_logits", outputs) # Compare tensor outputs self.assertTrue(torch.allclose(outputs.start_logits, transformers_outputs.start_logits, atol=1e-4)) self.assertTrue(torch.allclose(outputs.end_logits, transformers_outputs.end_logits, atol=1e-4)) + self.assertTrue(torch.equal(outputs.start_logits, loaded_model_outputs.start_logits)) + self.assertTrue(torch.equal(outputs.end_logits, loaded_model_outputs.end_logits)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -203,8 +218,14 @@ def test_compare_to_transformers(self, model_arch): transformers_model = AutoModelForCausalLM.from_pretrained(model_id) with torch.no_grad(): transformers_outputs = transformers_model(**tokens) + + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model_outputs = loaded_model(**tokens) # Compare tensor outputs self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) @parameterized.expand(SUPPORTED_ARCHITECTURES + IPEX_PATCHED_SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -327,8 +348,14 @@ def test_compare_to_transformers(self, model_arch): with torch.no_grad(): transformers_outputs = transformers_model(**inputs) outputs = ipex_model(**inputs) + + with tempfile.TemporaryDirectory() as tmpdirname: + ipex_model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model_outputs = loaded_model(**inputs) # Compare tensor outputs self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-3)) + self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): @@ -367,9 +394,16 @@ def test_compare_to_transformers(self, model_arch): with torch.no_grad(): transformers_outputs = transformers_model(**inputs) outputs = ipex_model(**inputs) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) + loaded_model_outputs = loaded_model(**inputs) + self.assertIn("logits", outputs) # Compare tensor outputs self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) + self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): From c08c95719fa48ebe24b8b45c12c95819dd61b5c7 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 11:14:42 -0400 Subject: [PATCH 36/49] fix hidden size check --- optimum/intel/ipex/modeling_base.py | 9 ++++++--- tests/ipex/test_modeling.py | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 2f26b250da..ecfc210ff6 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -73,10 +73,13 @@ def _is_patched_with_ipex(model, task): return True return False else: + norm_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config) # The ipex IAKV op in patched model requires the hidden size at least 64 - if 
model.config.hidden_size < 64: - return False - return model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES and task in _IPEX_EXPORTED_TASK + return ( + model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES + and task in _IPEX_EXPORTED_TASK + and norm_config.hidden_size >= 64 + ) def ipex_jit_trace(model, task, use_cache): diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 74088cfe54..f9e6e97aa9 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -222,7 +222,7 @@ def test_compare_to_transformers(self, model_arch): with tempfile.TemporaryDirectory() as tmpdirname: ipex_model.save_pretrained(tmpdirname) loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) - loaded_model_outputs = loaded_model(**tokens) + loaded_model_outputs = loaded_model(**inputs) # Compare tensor outputs self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) @@ -396,7 +396,7 @@ def test_compare_to_transformers(self, model_arch): outputs = ipex_model(**inputs) with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) + ipex_model.save_pretrained(tmpdirname) loaded_model = self.IPEX_MODEL_CLASS.from_pretrained(tmpdirname) loaded_model_outputs = loaded_model(**inputs) From 51e6f3de508043bdbea0da786ee98467d5c27212 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 11:20:22 -0400 Subject: [PATCH 37/49] rm norm config --- optimum/intel/ipex/modeling_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index ecfc210ff6..5e697a12f0 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -73,12 +73,11 @@ def _is_patched_with_ipex(model, task): return True return False else: - norm_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config) # The ipex IAKV op in patched model requires the hidden size at least 64 return ( model.config.model_type in _IPEX_SUPPORT_MODEL_TYPES and task in _IPEX_EXPORTED_TASK - and norm_config.hidden_size >= 64 + and model.config.hidden_size >= 64 ) From d06123b01cf57025a913800cc0d5014e4a51ec3c Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 11:43:59 -0400 Subject: [PATCH 38/49] add version variable --- optimum/exporters/ipex/model_patcher.py | 10 ++++++++-- optimum/exporters/ipex/modeling_utils.py | 16 ++++++++++++---- optimum/intel/ipex/modeling_base.py | 9 +++++++-- setup.py | 2 +- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 84fade5fc8..e08c855301 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -34,6 +34,10 @@ logger = logging.getLogger(__name__) +# Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version +_TRANSFORMERS_MIN_VERSION = "4.38.0" +_TRANSFORMERS_MAX_VERSION = "4.41.2" +_IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0" _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search") @@ -67,8 +71,10 @@ def patch_op(m, target_m, new_op_name, new_op): def _patch_llama_model(model): - if is_ipex_version("<", "2.3.0"): - raise ImportError("Only ipex version >= 2.3.0 supports 
RotaryEmbedding and IndirectAccessKVCacheAttention") + if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): + raise ImportError( + f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports RotaryEmbedding and IndirectAccessKVCacheAttention" + ) logger.warning( "Only greedy search and beam search with do_sample=True/False are verified for the patched model. It may have risk if other generation methods are applied" diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index d6b79828d8..dbf39dc20a 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -24,6 +24,8 @@ from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version +from .model_patcher import _IPEX_MINIMUM_VERSION_FOR_PATCHING, _TRANSFORMERS_MAX_VERSION, _TRANSFORMERS_MIN_VERSION + logger = logging.getLogger(__name__) @@ -225,10 +227,16 @@ def _llama_model_forward( # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L694 class _IPEXLlamaDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): - if is_ipex_version("<", "2.3.0"): - raise ImportError("Only ipex version > 2.3.0 supports Linear2SiluMul and LinearAdd") - if is_transformers_version("<", "4.38.2") or is_transformers_version(">", "4.41.2"): - raise ImportError("Only transformers versions 4.38.2 ~ 4.41.2 are verified.") + if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): + raise ImportError( + f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports Linear2SiluMul and LinearAdd" + ) + if is_transformers_version("<", _TRANSFORMERS_MIN_VERSION) or is_transformers_version( + ">", _TRANSFORMERS_MAX_VERSION + ): + raise ImportError( + f"Only transformers versions {_TRANSFORMERS_MIN_VERSION} ~ {_TRANSFORMERS_MAX_VERSION} are verified." 
+ ) from intel_extension_for_pytorch.llm.modules import Linear2SiluMul, LinearAdd diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 5e697a12f0..10adf236b6 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -50,7 +50,12 @@ from optimum.modeling_base import OptimizedModel from optimum.utils import NormalizedConfigManager -from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_GENERATION_METHODS, _IPEX_EXPORTED_TASK, _patch_model +from ...exporters.ipex.model_patcher import ( + _IPEX_EXPORTED_GENERATION_METHODS, + _IPEX_EXPORTED_TASK, + _IPEX_MINIMUM_VERSION_FOR_PATCHING, + _patch_model, +) from ..generation.modeling import prepare_jit_inputs from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask, recursive_to_device @@ -63,7 +68,7 @@ def _is_patched_with_ipex(model, task): - if is_ipex_version("<", "2.3.0"): + if is_ipex_version("<", _IPEX_MINIMUM_VERSION_FOR_PATCHING): return False if isinstance(model, torch.jit.ScriptModule): diff --git a/setup.py b/setup.py index 134fe0d238..d0003a287b 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.10.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.36.0,<=4.41.2"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.38.0,<=4.41.2"], "diffusers": ["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From 641e8f9bf970ac1a9cdb1296d69cbe4c479245af Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 12:21:32 -0400 Subject: [PATCH 39/49] fix import --- optimum/exporters/ipex/model_patcher.py | 5 +---- optimum/exporters/ipex/modeling_utils.py | 7 +++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index e08c855301..d684d52be8 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -25,6 +25,7 @@ from optimum.intel.utils.import_utils import is_ipex_version from .modeling_utils import ( + _IPEX_MINIMUM_VERSION_FOR_PATCHING, _IPEXLlamaDecoderLayerRef, _llama_attn_forward, _llama_layer_norm_forward, @@ -34,10 +35,6 @@ logger = logging.getLogger(__name__) -# Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version -_TRANSFORMERS_MIN_VERSION = "4.38.0" -_TRANSFORMERS_MAX_VERSION = "4.41.2" -_IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0" _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search") diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index dbf39dc20a..160c9aa7a3 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -24,11 +24,14 @@ from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version -from .model_patcher import _IPEX_MINIMUM_VERSION_FOR_PATCHING, _TRANSFORMERS_MAX_VERSION, _TRANSFORMERS_MIN_VERSION - logger = logging.getLogger(__name__) +# Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version 
+_TRANSFORMERS_MIN_VERSION = "4.38.0" +_TRANSFORMERS_MAX_VERSION = "4.41.2" +_IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0" + # Adapted from https://github.com/huggingface/transformers/blob/v4.38.2/src/transformers/models/llama/modeling_llama.py#L83 def _llama_layer_norm_forward(self, hidden_states): From 50c1059df94dc87ec8c4f10c11e800970e20f1ae Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 12:25:09 -0400 Subject: [PATCH 40/49] rm useless logger --- optimum/exporters/ipex/model_patcher.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index d684d52be8..2e80ece1bd 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging - from transformers.models.llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, @@ -33,8 +31,6 @@ ) -logger = logging.getLogger(__name__) - _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) _IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search") @@ -73,10 +69,6 @@ def _patch_llama_model(model): f"Only ipex version >= {_IPEX_MINIMUM_VERSION_FOR_PATCHING} supports RotaryEmbedding and IndirectAccessKVCacheAttention" ) - logger.warning( - "Only greedy search and beam search with do_sample=True/False are verified for the patched model. It may have risk if other generation methods are applied" - ) - from intel_extension_for_pytorch.llm.modules import IndirectAccessKVCacheAttention, RotaryEmbedding ipex_rope = RotaryEmbedding( From a9617467fd0136edb9d73fe697cdab82b2e94e1a Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 5 Jun 2024 12:27:07 -0400 Subject: [PATCH 41/49] rm useless logging --- optimum/exporters/ipex/modeling_utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index 160c9aa7a3..c46da2037b 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import logging import math from typing import List, Optional, Tuple, Union @@ -25,8 +24,6 @@ from optimum.intel.utils.import_utils import is_ipex_version, is_transformers_version -logger = logging.getLogger(__name__) - # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version _TRANSFORMERS_MIN_VERSION = "4.38.0" _TRANSFORMERS_MAX_VERSION = "4.41.2" From c2253a84bdc7354a2843897003729bd66a9f390c Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 6 Jun 2024 05:40:52 -0400 Subject: [PATCH 42/49] fix last round review --- .github/workflows/test_ipex.yml | 1 + optimum/exporters/ipex/model_patcher.py | 1 - optimum/intel/ipex/modeling_base.py | 23 +++++++++++++---------- tests/ipex/test_modeling.py | 7 +++++-- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 779c54be1b..030f6af850 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -33,6 +33,7 @@ jobs: python -m pip install --upgrade pip pip install torch torchaudio torchvision --extra-index-url https://download.pytorch.org/whl/cpu pip install .[ipex,tests] + pip install transformers==${{ matrix.transformers-version }} - name: Test with Pytest run: | pytest tests/ipex/ diff --git a/optimum/exporters/ipex/model_patcher.py b/optimum/exporters/ipex/model_patcher.py index 2e80ece1bd..0d87a5fd6c 100644 --- a/optimum/exporters/ipex/model_patcher.py +++ b/optimum/exporters/ipex/model_patcher.py @@ -33,7 +33,6 @@ _IPEX_EXPORTED_ARCH = ("LlamaForCausalLM",) _IPEX_EXPORTED_TASK = ("text-generation",) -_IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search") def convert_func(m, func_name, new_function): diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 10adf236b6..da95dc217e 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -50,12 +50,7 @@ from optimum.modeling_base import OptimizedModel from optimum.utils import NormalizedConfigManager -from ...exporters.ipex.model_patcher import ( - _IPEX_EXPORTED_GENERATION_METHODS, - _IPEX_EXPORTED_TASK, - _IPEX_MINIMUM_VERSION_FOR_PATCHING, - _patch_model, -) +from ...exporters.ipex.model_patcher import _IPEX_EXPORTED_TASK, _IPEX_MINIMUM_VERSION_FOR_PATCHING, _patch_model from ..generation.modeling import prepare_jit_inputs from ..utils.import_utils import is_ipex_version, is_torch_version, is_transformers_version from ..utils.modeling_utils import MULTI_QUERY_ATTN_MODELS, patch_decoder_attention_mask, recursive_to_device @@ -65,6 +60,7 @@ _IPEX_SUPPORT_MODEL_TYPES = ("llama",) +_IPEX_EXPORTED_GENERATION_METHODS = ("sample", "greedy_search", "beam_sample", "beam_search", "assisted_generation") def _is_patched_with_ipex(model, task): @@ -593,15 +589,22 @@ def forward( def _prepare_generation_config( self, generation_config: Optional[GenerationConfig], **kwargs: Dict ) -> Tuple[GenerationConfig, Dict]: - generation_config, model_kwargs = self.model_cls._prepare_generation_config(self, generation_config, **kwargs) - generation_method = generation_config.get_generation_mode(kwargs.get("assistant_model", None)).value - if self._is_ipex_exported and generation_method not in _IPEX_EXPORTED_GENERATION_METHODS: + generation_config, model_kwargs = super()._prepare_generation_config(generation_config, **kwargs) + generation_method = generation_config.get_generation_mode().value + if generation_method not in 
_IPEX_EXPORTED_GENERATION_METHODS: raise ValueError( - f"The generation method {generation_method} is not supported for patched models for now, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}" + f"The generation method {generation_method} is not supported for IPEXModelForCausalLM for now, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}" ) return generation_config, model_kwargs + def generate(self, **kwargs): + if self._is_ipex_exported and kwargs.get("assistant_model", None): + raise ValueError( + f"Assisted decoding is not supported for patched models for now, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}" + ) + return super().generate(**kwargs) + def _prepare_inputs_for_generation_for_llama( input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index f9e6e97aa9..3f794771e7 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -188,6 +188,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): "gpt_neox", "mistral", "llama", + "llama2", # "phi", "mpt", "opt", @@ -196,7 +197,7 @@ class IPEXModelForCausalLMTest(unittest.TestCase): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.0 - @parameterized.expand(SUPPORTED_ARCHITECTURES + IPEX_PATCHED_SUPPORTED_ARCHITECTURES) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] set_seed(SEED) @@ -227,7 +228,7 @@ def test_compare_to_transformers(self, model_arch): self.assertTrue(torch.allclose(outputs.logits, transformers_outputs.logits, atol=1e-4)) self.assertTrue(torch.equal(outputs.logits, loaded_model_outputs.logits)) - @parameterized.expand(SUPPORTED_ARCHITECTURES + IPEX_PATCHED_SUPPORTED_ARCHITECTURES) + @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_pipeline(self, model_arch): model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -242,6 +243,8 @@ def test_pipeline(self, model_arch): # High optimized model llama is not supported assisted decoding for now. 
@parameterized.expand(SUPPORTED_ARCHITECTURES) def test_assisted_decoding(self, model_arch): + if model_arch == "llama2": + return model_id = MODEL_NAMES[model_arch] tokenizer = AutoTokenizer.from_pretrained(model_id) ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) From caa27c3dd8e42f522578dbc97e7afccda2721daf Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:06:07 +0200 Subject: [PATCH 43/49] Update .github/workflows/test_ipex.yml --- .github/workflows/test_ipex.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_ipex.yml b/.github/workflows/test_ipex.yml index 030f6af850..7bb8947ab7 100644 --- a/.github/workflows/test_ipex.yml +++ b/.github/workflows/test_ipex.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - transformers-version: [4.38.0, 4.41.2] + transformers-version: [4.39.0, 4.41.2] os: [ubuntu-latest] runs-on: ${{ matrix.os }} From 78498ab21106ccc94142c442cbd438e8c61cd0f3 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:06:15 +0200 Subject: [PATCH 44/49] Update optimum/intel/ipex/modeling_base.py --- optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index da95dc217e..990880c8ec 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -598,7 +598,7 @@ def _prepare_generation_config( return generation_config, model_kwargs - def generate(self, **kwargs): + def generate(self, *args, **kwargs): if self._is_ipex_exported and kwargs.get("assistant_model", None): raise ValueError( f"Assisted decoding is not supported for patched models for now, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}" From 97f78768a98432fe4119b8f4842150dfc47c27c1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:06:21 +0200 Subject: [PATCH 45/49] Update optimum/intel/ipex/modeling_base.py --- optimum/intel/ipex/modeling_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/ipex/modeling_base.py b/optimum/intel/ipex/modeling_base.py index 990880c8ec..3750d56227 100644 --- a/optimum/intel/ipex/modeling_base.py +++ b/optimum/intel/ipex/modeling_base.py @@ -603,7 +603,7 @@ def generate(self, *args, **kwargs): raise ValueError( f"Assisted decoding is not supported for patched models for now, support methods are {_IPEX_EXPORTED_GENERATION_METHODS}" ) - return super().generate(**kwargs) + return super().generate(*args, **kwargs) def _prepare_inputs_for_generation_for_llama( From cf3525a5db6fb028230969b1fea6539464a29a18 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:06:28 +0200 Subject: [PATCH 46/49] Update setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 446b0f117f..cfb28db878 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ "neural-compressor": ["neural-compressor>=2.2.0", "onnxruntime<1.15.0", "accelerate"], "openvino": ["openvino>=2023.3", "nncf>=2.10.0", "openvino-tokenizers[transformers]"], "nncf": ["nncf>=2.10.0"], - "ipex": ["intel-extension-for-pytorch", "transformers>=4.38.0,<=4.41.2"], + "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<=4.41.2"], "diffusers": 
["diffusers"], "quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE, From 8ba602da07ab28349b5d253b61dd848f2bb4aab1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 6 Jun 2024 14:06:36 +0200 Subject: [PATCH 47/49] Update optimum/exporters/ipex/modeling_utils.py --- optimum/exporters/ipex/modeling_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py index c46da2037b..a2b73e74ae 100644 --- a/optimum/exporters/ipex/modeling_utils.py +++ b/optimum/exporters/ipex/modeling_utils.py @@ -25,7 +25,7 @@ # Please also update in the setup.py and .github/workflows/test_ipex.yml if you change the transformers version -_TRANSFORMERS_MIN_VERSION = "4.38.0" +_TRANSFORMERS_MIN_VERSION = "4.39.0" _TRANSFORMERS_MAX_VERSION = "4.41.2" _IPEX_MINIMUM_VERSION_FOR_PATCHING = "2.3.0" From f15a1f53eef43fa9b9faf20bbd8ee0648d86bcf6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 6 Jun 2024 16:38:20 -0400 Subject: [PATCH 48/49] fix --- tests/ipex/test_pipelines.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ipex/test_pipelines.py b/tests/ipex/test_pipelines.py index e571d5a6a1..c4ae471a0f 100644 --- a/tests/ipex/test_pipelines.py +++ b/tests/ipex/test_pipelines.py @@ -56,6 +56,7 @@ class PipelinesIntegrationTest(unittest.TestCase): "gpt2", "gpt_neo", "gpt_neox", + "llama", "llama2", "mistral", "mpt", From 36ae751f6ccd3fb9b9006019d9fc6db12f09e5f2 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 6 Jun 2024 16:46:32 -0400 Subject: [PATCH 49/49] limit the new tokens of assisted decoding tests --- tests/ipex/test_modeling.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/ipex/test_modeling.py b/tests/ipex/test_modeling.py index 3f794771e7..8664b99cee 100644 --- a/tests/ipex/test_modeling.py +++ b/tests/ipex/test_modeling.py @@ -250,11 +250,13 @@ def test_assisted_decoding(self, model_arch): ipex_model = IPEXModelForCausalLM.from_pretrained(model_id, export=True) transformers_model = AutoModelForCausalLM.from_pretrained(model_id) tokens = tokenizer("This is a sample input", return_tensors="pt") - ipex_output = ipex_model.generate(**tokens, do_sample=False) - ipex_output_assisted = ipex_model.generate(**tokens, do_sample=False, assistant_model=transformers_model) - transformers_output = transformers_model.generate(**tokens, do_sample=False) + ipex_output = ipex_model.generate(**tokens, do_sample=False, max_new_tokens=4) + ipex_output_assisted = ipex_model.generate( + **tokens, do_sample=False, assistant_model=transformers_model, max_new_tokens=4 + ) + transformers_output = transformers_model.generate(**tokens, do_sample=False, max_new_tokens=4) transformers_output_assisted = transformers_model.generate( - **tokens, do_sample=False, assistant_model=ipex_model + **tokens, do_sample=False, assistant_model=ipex_model, max_new_tokens=4 ) self.assertTrue(torch.equal(ipex_output, ipex_output_assisted)) self.assertTrue(torch.equal(transformers_output, transformers_output_assisted))