From 3abd79067950ad06e836c99b1eddee15eece4d0a Mon Sep 17 00:00:00 2001
From: "Lin, Fanli"
Date: Mon, 3 Jun 2024 10:35:22 -0400
Subject: [PATCH] use with and without cache

---
 optimum/exporters/ipex/modeling_utils.py | 110 ++++++++++++-----------
 1 file changed, 57 insertions(+), 53 deletions(-)

diff --git a/optimum/exporters/ipex/modeling_utils.py b/optimum/exporters/ipex/modeling_utils.py
index ea387eb41f..195c7c3829 100644
--- a/optimum/exporters/ipex/modeling_utils.py
+++ b/optimum/exporters/ipex/modeling_utils.py
@@ -172,60 +172,60 @@ def qkv_gemm(self, hidden_states):
 
         return query, key, value
 
-    def rope(self, query, key, kv_seq_len, position_ids, use_cache):
-        if use_cache:
-            key = self.ipex_rope(
-                key,
-                position_ids,
-                self.num_key_value_heads,
-                self.head_dim,
-                self.head_dim // 2,
-                self.head_dim,
-                kv_seq_len,
-            )
-            query = self.ipex_rope(
-                query,
-                position_ids,
-                self.num_heads,
-                self.head_dim,
-                self.head_dim // 2,
-                self.head_dim,
-                kv_seq_len,
-            )
+    def rope(self, query, key, kv_seq_len, position_ids):
+        key = self.ipex_rope(
+            key,
+            position_ids,
+            self.num_key_value_heads,
+            self.head_dim,
+            self.head_dim // 2,
+            self.head_dim,
+            kv_seq_len,
+        )
+        query = self.ipex_rope(
+            query,
+            position_ids,
+            self.num_heads,
+            self.head_dim,
+            self.head_dim // 2,
+            self.head_dim,
+            kv_seq_len,
+        )
         return query, key
 
-    def sdpa(self, query, key, value, past_key_value, attention_mask, use_cache):
-        if use_cache:
-            # This ipex op pre-allocates buffers for past_key_values and use beam index history
-            # which to decide which beam should be used to make attention scale dot more efficient.
-            (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product(
-                query,
-                key,
-                value,
-                math.sqrt(self.head_dim),
-                past_key_value,
-                None,
-                attention_mask,
-            )
-        else:
-            value_states = value.transpose(1, 2)
-            query_states = query.transpose(1, 2)
-            key_states = key.transpose(1, 2)
+    def sdpa_with_cache(self, query, key, value, past_key_value, attention_mask):
+        # This ipex op pre-allocates buffers for past_key_values and use beam index history
+        # which to decide which beam should be used to make attention scale dot more efficient.
+        (attn_output, attn_weights, past_key_value) = self.ipex_scale_dot_product(
+            query,
+            key,
+            value,
+            math.sqrt(self.head_dim),
+            past_key_value,
+            None,
+            attention_mask,
+        )
+        return attn_output, past_key_value, attn_weights
+
+    def sdpa_without_cache(self, query, key, value, past_key_value, attention_mask):
+        value_states = value.transpose(1, 2)
+        query_states = query.transpose(1, 2)
+        key_states = key.transpose(1, 2)
 
-            past_key_value = None
-            # repeat k/v heads if n_kv_heads < n_heads
-            key_states = repeat_kv(key_states, self.num_key_value_groups)
-            value_states = repeat_kv(value_states, self.num_key_value_groups)
+        past_key_value = None
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
-            if attention_mask is not None:
-                attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask)
-                attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        if attention_mask is not None:
+            attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask)
+            attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
 
-            # upcast attention to fp32
-            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-            attn_output = torch.matmul(attn_weights, value_states)
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
 
         return attn_output, past_key_value, attn_weights
 
@@ -266,11 +266,15 @@ def forward(
         kv_seq_len = seq_len + past_key_value[0].size(-2) if past_key_value is not None else seq_len
 
         query, key, value = self.qkv_gemm(hidden_states)
-        query, key = self.rope(query, key, kv_seq_len, position_ids, use_cache)
-
-        attn_output, past_key_value, attn_weights = self.sdpa(
-            query, key, value, past_key_value, attention_mask, use_cache
-        )
+        if use_cache:
+            query, key = self.rope(query, key, kv_seq_len, position_ids)
+            attn_output, past_key_value, attn_weights = self.sdpa_with_cache(
+                query, key, value, past_key_value, attention_mask
+            )
+        else:
+            attn_output, past_key_value, attn_weights = self.sdpa_without_cache(
+                query, key, value, past_key_value, attention_mask
+            )
         attn_output = attn_output.transpose(1, 2).view(bsz, seq_len, self.hidden_size)
 
         if hasattr(self, "mha_linear_add"):
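
Note: the hunks above only reshuffle existing logic; the eager-attention math now living in `sdpa_without_cache` is unchanged. For readers who want to experiment with that path outside the class, below is a minimal, self-contained sketch in plain PyTorch. It is an illustration, not the library code: it assumes `qkv_gemm`-style inputs of shape (batch, seq, heads, head_dim), uses a simplified stand-in for the Transformers `repeat_kv` helper, calls no IPEX op, and the function names are hypothetical.

```python
import math

import torch
from torch import nn


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Expand (bsz, num_kv_heads, seq, head_dim) to (bsz, num_kv_heads * n_rep, seq, head_dim).
    bsz, num_kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(bsz, num_kv_heads, n_rep, seq_len, head_dim)
    return hidden_states.reshape(bsz, num_kv_heads * n_rep, seq_len, head_dim)


def sdpa_without_cache_sketch(query, key, value, attention_mask, num_key_value_groups, head_dim):
    # query: (bsz, seq, num_heads, head_dim); key/value: (bsz, seq, num_kv_heads, head_dim),
    # i.e. the layout produced before any transpose.
    query_states = query.transpose(1, 2)
    key_states = repeat_kv(key.transpose(1, 2), num_key_value_groups)
    value_states = repeat_kv(value.transpose(1, 2), num_key_value_groups)

    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(head_dim)
    if attention_mask is not None:
        # Additive mask: 0 for visible positions, a large negative value for masked ones.
        attn_weights = attn_weights + attention_mask
        attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))

    # Upcast to fp32 for the softmax, then cast back to the activation dtype.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
    attn_output = torch.matmul(attn_weights, value_states)  # (bsz, num_heads, seq, head_dim)
    return attn_output, attn_weights


if __name__ == "__main__":
    bsz, seq, num_heads, num_kv_heads, head_dim = 1, 4, 8, 2, 16
    q = torch.randn(bsz, seq, num_heads, head_dim)
    k = torch.randn(bsz, seq, num_kv_heads, head_dim)
    v = torch.randn(bsz, seq, num_kv_heads, head_dim)
    out, _ = sdpa_without_cache_sketch(q, k, v, None, num_heads // num_kv_heads, head_dim)
    print(out.shape)  # torch.Size([1, 8, 4, 16])
```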