From ac18dd4c351941f9606965f71090f4db82858384 Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Wed, 13 Nov 2024 19:44:27 +0800
Subject: [PATCH 1/4] change kvcache default type to u8 for cpu plugin

---
 src/cpp/src/device_config.hpp | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 2af4559ef1..2e1ad2bbda 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -36,30 +36,14 @@ class DeviceConfig {
         m_block_size = get_block_size_by_device(device);

         if (m_device == "CPU") {
-            auto inference_precision = core.get_property(device, ov::hint::inference_precision);
-            m_kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-
-            // if user sets precision hint, kv cache type should be changed
-            const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name());
-            if (inference_precision_it != plugin_config.end()) {
-                const auto inference_precision = inference_precision_it->second.as<ov::element::Type>();
-                if (inference_precision == ov::element::f32) {
-                    m_kv_cache_type = ov::element::f32;
-                } else if (inference_precision == ov::element::f16) {
-                    m_kv_cache_type = ov::element::f16;
-                } else if (inference_precision == ov::element::bf16) {
-                    m_kv_cache_type = ov::element::bf16;
-                } else {
-                    // use default f32
-                    m_kv_cache_type = ov::element::f32;
-                }
-            }
-
             // if user sets ov::kv_cache_precision hint
             const auto kv_cache_precision_it = plugin_config.find(ov::hint::kv_cache_precision.name());
             if (kv_cache_precision_it != plugin_config.end()) {
                 const auto kv_cache_precision = kv_cache_precision_it->second.as<ov::element::Type>();
                 m_kv_cache_type = kv_cache_precision;
+            } else {
+                // x86 and arm have different default kv cache types
+                m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
             }
         } else if (m_device.find("GPU") != std::string::npos) {
             auto inference_precision = core.get_property(device, ov::hint::inference_precision);

From ffef13ec6d1f219467f216cde4cc31be66d7b45c Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Thu, 14 Nov 2024 15:09:12 +0800
Subject: [PATCH 2/4] use f32 for hint: EXECUTION_MODE_HINT:ACCURACY

---
 src/cpp/src/device_config.hpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 2e1ad2bbda..f184035a83 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -42,8 +42,14 @@ class DeviceConfig {
                 const auto kv_cache_precision = kv_cache_precision_it->second.as<ov::element::Type>();
                 m_kv_cache_type = kv_cache_precision;
             } else {
-                // x86 and arm have different default kv cache types
-                m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
+                // ACCURACY mode will use f32 kvcache
+                const auto execution_mode_it = plugin_config.find(ov::hint::execution_mode.name());
+                if (execution_mode_it != plugin_config.end() && execution_mode_it->second.as<ov::hint::ExecutionMode>() == ov::hint::ExecutionMode::ACCURACY) {
+                    m_kv_cache_type = ov::element::f32;
+                } else {
+                    // x86 and arm have different default kv cache types
+                    m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
+                }
             }
         } else if (m_device.find("GPU") != std::string::npos) {
             auto inference_precision = core.get_property(device, ov::hint::inference_precision);
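Taken together, patches 1 and 2 give the CPU branch a three-step resolution order: an explicit ov::hint::kv_cache_precision from the user always wins, an ACCURACY execution mode falls back to lossless f32, and otherwise the plugin's own default is used (now u8, and different between x86 and ARM). A minimal standalone sketch of that order follows; resolve_cpu_kv_cache_type is a hypothetical name for illustration, not a function in the patched code:

#include "openvino/runtime/core.hpp"

// Sketch only: mirrors the precision-resolution order after patches 1 and 2.
ov::element::Type resolve_cpu_kv_cache_type(ov::Core& core, const ov::AnyMap& plugin_config) {
    // 1. An explicit user hint has the highest priority.
    auto precision_it = plugin_config.find(ov::hint::kv_cache_precision.name());
    if (precision_it != plugin_config.end())
        return precision_it->second.as<ov::element::Type>();
    // 2. ACCURACY execution mode forces a lossless f32 cache.
    auto mode_it = plugin_config.find(ov::hint::execution_mode.name());
    if (mode_it != plugin_config.end() &&
        mode_it->second.as<ov::hint::ExecutionMode>() == ov::hint::ExecutionMode::ACCURACY)
        return ov::element::f32;
    // 3. Otherwise defer to the plugin default (u8 on CPU after this series).
    return core.get_property("CPU", ov::hint::kv_cache_precision);
}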
From 9efab9df8cf2f975a7c20d208e6ca3ea5f7e38e4 Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Thu, 14 Nov 2024 16:10:17 +0800
Subject: [PATCH 3/4] Revert "[GHA]: hardcode OpenVINO commit (#1212)"

This reverts commit 9243a8fc5d13ebd5bf601432c4b2ede897066a41.
---
 .github/workflows/linux.yml   | 2 +-
 .github/workflows/mac.yml     | 2 +-
 .github/workflows/windows.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 3c3e0347e7..5958dafe33 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.9'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }}
   OV_TARBALL: ''

 jobs:
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 935d6556b3..25da21b209 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.9'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }}
   OV_TARBALL: ''

 jobs:
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index aef0181baa..33096d6d7b 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }}
   OV_TARBALL: ''

 jobs:
From 400e41c1395ff914f4f3314725c24e7cab95bcec Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Tue, 31 Dec 2024 09:10:15 +0100
Subject: [PATCH 4/4] fix ci errors

---
 tests/cpp/cache_manager.cpp | 41 +++++++++++++----------------------------
 tests/cpp/device_config.cpp |  8 ++++----
 tests/cpp/helper.cpp        | 29 +++++++++++++++++++++++++++++
 tests/cpp/helper.hpp        |  8 ++++++++
 tests/cpp/scheduler.cpp     | 26 ++------------------------
 5 files changed, 56 insertions(+), 56 deletions(-)
 create mode 100644 tests/cpp/helper.cpp
 create mode 100644 tests/cpp/helper.hpp

diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp
index 095cc39f09..87b24bb575 100644
--- a/tests/cpp/cache_manager.cpp
+++ b/tests/cpp/cache_manager.cpp
@@ -7,34 +7,10 @@
 #include "scheduler.hpp"
 #include "device_config.hpp"
 #include "cache_manager.hpp"
-#include "openvino/op/concat.hpp"
+#include "helper.hpp"

 using namespace ov::genai;

-std::shared_ptr<ov::Model> get_dummy_model(ov::Core core, size_t num_layers) {
-    ov::NodeVector keys;
-    ov::NodeVector values;
-    ov::ParameterVector params;
-    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
-    ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-
-    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
-    for (size_t i = 0; i < num_layers; i++) {
-        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
-        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
-        keys.push_back(key);
-        values.push_back(value);
-        params.push_back(key);
-        params.push_back(value);
-    }
-    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
-    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
-    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-}
-
 size_t get_total_allocated_bytes(std::shared_ptr<ov::genai::CacheManager> cache_manager, size_t num_decoder_layers) {
     size_t allocated_bytes = 0;
     for (size_t i = 0; i < num_decoder_layers; i++) {
@@ -58,14 +34,23 @@ TEST(TestCacheManager, test_cache_size_param) {
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
     size_t num_decoder_layers = 12;
     std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    size_t head_size = 64;
+    device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);

     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
     auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
     cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
-
-    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360);
+
+    const size_t kv_cache_total_size = scheduler_config.cache_size * 1024 * 1024 * 1024;
+    const size_t cpu_block_size = 32;
+    // For u8 kvcache, the scale, zero point and quantized data are stored together.
+    // The layout per token per head is:
+    // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
+    // so head_size has to be extended by 2 * sizeof(float).
+    const size_t cpu_block_size_total = num_decoder_layers * (num_kv_heads[0] + num_kv_heads[1]) * cpu_block_size * (head_size + 2 * sizeof(float)) * sizeof(uint8_t);
+    size_t expected_size = kv_cache_total_size / cpu_block_size_total * cpu_block_size_total;
+    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), expected_size);
 }

diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp
index 973648f637..9af87bab56 100644
--- a/tests/cpp/device_config.cpp
+++ b/tests/cpp/device_config.cpp
@@ -20,12 +20,12 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) {
     size_t head_size = 64, head_size_u8 = head_size + 8;
     std::vector<size_t> num_kv_heads(12, 12);

-    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
-    device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    ov::genai::DeviceConfig device_config_f16(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::f16) });
+    device_config_f16.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);

-    ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::u8) });
+    ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU");
     device_config_u8.set_model_params(num_kv_heads, head_size, num_decoder_layers);

     const auto ratio = ov::element::f16.size() / ov::element::u8.size();
-    ASSERT_EQ(device_config_default.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks());
+    ASSERT_EQ(device_config_f16.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks());
 }
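The arithmetic in the new assertion can be checked by hand: per token and per head, a u8 block stores head_size quantized bytes plus an f32 scale and an f32 zero point, i.e. 64 + 8 = 72 bytes. A compile-time sketch with the test's numbers follows, assuming scheduler_config.cache_size is 2 GB; the fixture's actual value lies outside this hunk:

#include <cstddef>

// Worked example under the stated assumption; constants mirror the test above.
constexpr std::size_t layers = 12;                                // num_decoder_layers
constexpr std::size_t kv_heads = 12 + 12;                         // num_kv_heads[0] + num_kv_heads[1]
constexpr std::size_t block_tokens = 32;                          // cpu_block_size
constexpr std::size_t token_head_bytes = 64 + 2 * sizeof(float);  // u8 data + scale + zero point = 72
constexpr std::size_t block_bytes = layers * kv_heads * block_tokens * token_head_bytes;  // 663552
constexpr std::size_t budget = std::size_t{2} * 1024 * 1024 * 1024;                       // assumed 2 GB
// The allocation is rounded down to a whole number of blocks:
static_assert(budget / block_bytes * block_bytes == 2147254272, "expected_size under these assumptions");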
diff --git a/tests/cpp/helper.cpp b/tests/cpp/helper.cpp
new file mode 100644
index 0000000000..6dd7664523
--- /dev/null
+++ b/tests/cpp/helper.cpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "helper.hpp"
+#include "openvino/op/concat.hpp"
+
+std::shared_ptr<ov::Model> get_dummy_model(ov::Core core, size_t num_layers) {
+    ov::NodeVector keys;
+    ov::NodeVector values;
+    ov::ParameterVector params;
+    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
+    ov::element::Type kv_cache_type = core.get_property("CPU", ov::hint::kv_cache_precision);
+
+    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
+    for (size_t i = 0; i < num_layers; i++) {
+        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
+        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
+        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
+        keys.push_back(key);
+        values.push_back(value);
+        params.push_back(key);
+        params.push_back(value);
+    }
+    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
+    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
+    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+}
diff --git a/tests/cpp/helper.hpp b/tests/cpp/helper.hpp
new file mode 100644
index 0000000000..1fafe8bcf6
--- /dev/null
+++ b/tests/cpp/helper.hpp
@@ -0,0 +1,8 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/runtime/core.hpp"
+
+std::shared_ptr<ov::Model> get_dummy_model(ov::Core core, size_t num_layers);
\ No newline at end of file
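Both test files now obtain the dummy model from this shared helper, and its parameter element type tracks the plugin-reported KV-cache precision rather than the inference precision. A short usage sketch mirroring the call sites in the two tests; make_dummy_request is a hypothetical wrapper name, not part of the patch:

#include "helper.hpp"

// Mirrors the call sites in cache_manager.cpp and scheduler.cpp: build the
// dummy KV-cache model, compile it on the default device, and hand the
// infer request to the component under test.
ov::InferRequest make_dummy_request() {
    ov::Core core;
    auto model = get_dummy_model(core, /*num_layers=*/12);
    return core.compile_model(model).create_infer_request();
}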
diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp
index 23594adf50..6eeb8b8ed4 100644
--- a/tests/cpp/scheduler.cpp
+++ b/tests/cpp/scheduler.cpp
@@ -9,6 +9,7 @@
 #include "openvino/genai/generation_config.hpp"
 #include "sequence_group.hpp"
 #include "scheduler.hpp"
+#include "helper.hpp"

 using namespace ov::genai;

@@ -18,34 +19,11 @@ void clear_finished_sequences(std::vector<SequenceGroup::Ptr>& requests) {
     });
     requests.erase(new_end, requests.end());
 }
-std::shared_ptr<ov::Model> get_model(ov::Core core, size_t num_layers) {
-    ov::NodeVector keys;
-    ov::NodeVector values;
-    ov::ParameterVector params;
-    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
-    ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-
-    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
-    for (size_t i = 0; i < num_layers; i++) {
-        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
-        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
-        keys.push_back(key);
-        values.push_back(value);
-        params.push_back(key);
-        params.push_back(value);
-    }
-    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
-    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
-    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-}

 std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_config) {
     ov::Core core = ov::Core();
     size_t num_decoder_layers = 12;
-    ov::InferRequest request = core.compile_model(get_model(core, num_decoder_layers)).create_infer_request();
+    ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     size_t head_size = 64, head_size_u8 = head_size + 8;
     std::vector<size_t> num_kv_heads(12, 12);
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");