From ac18dd4c351941f9606965f71090f4db82858384 Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Wed, 13 Nov 2024 19:44:27 +0800
Subject: [PATCH 1/4] change kvcache default type to u8 for cpu plugin

---
 src/cpp/src/device_config.hpp | 22 +++-------------------
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 2af4559ef1..2e1ad2bbda 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -36,30 +36,14 @@ class DeviceConfig {
         m_block_size = get_block_size_by_device(device);

         if (m_device == "CPU") {
-            auto inference_precision = core.get_property(device, ov::hint::inference_precision);
-            m_kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-
-            // if user sets precision hint, kv cache type should be changed
-            const auto inference_precision_it = plugin_config.find(ov::hint::inference_precision.name());
-            if (inference_precision_it != plugin_config.end()) {
-                const auto inference_precision = inference_precision_it->second.as<ov::element::Type>();
-                if (inference_precision == ov::element::f32) {
-                    m_kv_cache_type = ov::element::f32;
-                } else if (inference_precision == ov::element::f16) {
-                    m_kv_cache_type = ov::element::f16;
-                } else if (inference_precision == ov::element::bf16) {
-                    m_kv_cache_type = ov::element::bf16;
-                } else {
-                    // use default f32
-                    m_kv_cache_type = ov::element::f32;
-                }
-            }
-
             // if user sets ov::kv_cache_precision hint
             const auto kv_cache_precision_it = plugin_config.find(ov::hint::kv_cache_precision.name());
             if (kv_cache_precision_it != plugin_config.end()) {
                 const auto kv_cache_precision = kv_cache_precision_it->second.as<ov::element::Type>();
                 m_kv_cache_type = kv_cache_precision;
+            } else {
+                // x86 and arm have different default kv cache types
+                m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
             }
         } else if (m_device.find("GPU") != std::string::npos) {
             auto inference_precision = core.get_property(device, ov::hint::inference_precision);

From ffef13ec6d1f219467f216cde4cc31be66d7b45c Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Thu, 14 Nov 2024 15:09:12 +0800
Subject: [PATCH 2/4] use f32 for hint: EXECUTION_MODE_HINT:ACCURACY

---
 src/cpp/src/device_config.hpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/cpp/src/device_config.hpp b/src/cpp/src/device_config.hpp
index 2e1ad2bbda..f184035a83 100644
--- a/src/cpp/src/device_config.hpp
+++ b/src/cpp/src/device_config.hpp
@@ -42,8 +42,14 @@ class DeviceConfig {
                 const auto kv_cache_precision = kv_cache_precision_it->second.as<ov::element::Type>();
                 m_kv_cache_type = kv_cache_precision;
             } else {
-                // x86 and arm have different default kv cache types
-                m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
+                // ACCURACY mode will use f32 kvcache
+                const auto execution_mode_it = plugin_config.find(ov::hint::execution_mode.name());
+                if (execution_mode_it != plugin_config.end() && execution_mode_it->second.as<ov::hint::ExecutionMode>() == ov::hint::ExecutionMode::ACCURACY) {
+                    m_kv_cache_type = ov::element::f32;
+                } else {
+                    // x86 and arm have different default kv cache types
+                    m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
+                }
             }
         } else if (m_device.find("GPU") != std::string::npos) {
             auto inference_precision = core.get_property(device, ov::hint::inference_precision);
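Taken together, patches 1 and 2 give the CPU branch a three-step resolution order: an explicit ov::hint::kv_cache_precision from the user always wins, an ACCURACY execution mode falls back to lossless f32, and otherwise the plugin's own default is used (now u8, and different between x86 and ARM). A minimal standalone sketch of that order follows; resolve_cpu_kv_cache_type is a hypothetical name for illustration, not a function in the patched code:

#include "openvino/runtime/core.hpp"

// Sketch only: mirrors the precision-resolution order after patches 1 and 2.
ov::element::Type resolve_cpu_kv_cache_type(ov::Core& core, const ov::AnyMap& plugin_config) {
    // 1. An explicit user hint has the highest priority.
    auto precision_it = plugin_config.find(ov::hint::kv_cache_precision.name());
    if (precision_it != plugin_config.end())
        return precision_it->second.as<ov::element::Type>();
    // 2. ACCURACY execution mode forces a lossless f32 cache.
    auto mode_it = plugin_config.find(ov::hint::execution_mode.name());
    if (mode_it != plugin_config.end() &&
        mode_it->second.as<ov::hint::ExecutionMode>() == ov::hint::ExecutionMode::ACCURACY)
        return ov::element::f32;
    // 3. Otherwise defer to the plugin default (u8 on CPU after this series).
    return core.get_property("CPU", ov::hint::kv_cache_precision);
}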
From 9efab9df8cf2f975a7c20d208e6ca3ea5f7e38e4 Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Thu, 14 Nov 2024 16:10:17 +0800
Subject: [PATCH 3/4] Revert "[GHA]: hardcode OpenVINO commit (#1212)"

This reverts commit 9243a8fc5d13ebd5bf601432c4b2ede897066a41.
---
 .github/workflows/linux.yml   | 2 +-
 .github/workflows/mac.yml     | 2 +-
 .github/workflows/windows.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 3c3e0347e7..5958dafe33 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.9'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }}
   OV_TARBALL: ''

 jobs:
diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
index 935d6556b3..25da21b209 100644
--- a/.github/workflows/mac.yml
+++ b/.github/workflows/mac.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.9'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }}
   OV_TARBALL: ''

 jobs:
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index aef0181baa..33096d6d7b 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -17,7 +17,7 @@ concurrency:

 env:
   PYTHON_VERSION: '3.11'
-  OV_BRANCH: 0080d90974ca84f9a6d359da3388a2a18a93b753
+  OV_BRANCH: ${{ github.base_ref || github.event.merge_group.base_ref || github.ref }}
   OV_TARBALL: ''

 jobs:
From 400e41c1395ff914f4f3314725c24e7cab95bcec Mon Sep 17 00:00:00 2001
From: Luo Cheng
Date: Tue, 31 Dec 2024 09:10:15 +0100
Subject: [PATCH 4/4] fix ci errors

---
 tests/cpp/cache_manager.cpp | 41 +++++++++++++----------------------------
 tests/cpp/device_config.cpp |  8 ++++----
 tests/cpp/helper.cpp        | 29 +++++++++++++++++++++++++++++
 tests/cpp/helper.hpp        |  8 ++++++++
 tests/cpp/scheduler.cpp     | 26 ++------------------------
 5 files changed, 56 insertions(+), 56 deletions(-)
 create mode 100644 tests/cpp/helper.cpp
 create mode 100644 tests/cpp/helper.hpp

diff --git a/tests/cpp/cache_manager.cpp b/tests/cpp/cache_manager.cpp
index 095cc39f09..87b24bb575 100644
--- a/tests/cpp/cache_manager.cpp
+++ b/tests/cpp/cache_manager.cpp
@@ -7,34 +7,10 @@
 #include "scheduler.hpp"
 #include "device_config.hpp"
 #include "cache_manager.hpp"
-#include "openvino/op/concat.hpp"
+#include "helper.hpp"

 using namespace ov::genai;

-std::shared_ptr<ov::Model> get_dummy_model(ov::Core core, size_t num_layers) {
-    ov::NodeVector keys;
-    ov::NodeVector values;
-    ov::ParameterVector params;
-    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
-    ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-
-    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
-    for (size_t i = 0; i < num_layers; i++) {
-        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
-        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
-        keys.push_back(key);
-        values.push_back(value);
-        params.push_back(key);
-        params.push_back(value);
-    }
-    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
-    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
-    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-}
-
 size_t get_total_allocated_bytes(std::shared_ptr<ov::genai::CacheManager> cache_manager, size_t num_decoder_layers) {
     size_t allocated_bytes = 0;
     for (size_t i = 0; i < num_decoder_layers; i++) {
@@ -58,14 +34,23 @@ TEST(TestCacheManager, test_cache_size_param) {
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");
     size_t num_decoder_layers = 12;
     std::vector<size_t> num_kv_heads(12, 12);
-    device_config.set_model_params(num_kv_heads, 64, num_decoder_layers);
+    size_t head_size = 64;
+    device_config.set_model_params(num_kv_heads, head_size, num_decoder_layers);

     ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     auto cache_manager = std::make_shared<ov::genai::CacheManager>(device_config, request, core);
     auto block_manager = BlockManager(device_config.get_num_kv_blocks(), false, device_config.get_block_size(), device_config.get_num_layers());
     cache_manager->allocate_cache_if_needed(block_manager.get_total_number_of_kv_blocks());
-
-    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), 2146959360);
+
+    const size_t kv_cache_total_size = scheduler_config.cache_size * 1024 * 1024 * 1024;
+    const size_t cpu_block_size = 32;
+    // For u8 kvcache, the scale, zero point and quantized data are stored together.
+    // The layout per token per head is:
+    // |scale(f32)|zeropoint(f32)|quantized data(u8,idx_1)|quantized data(u8,idx_2)|...|quantized data(u8,idx_head_size)|
+    // so head_size has to be extended by 2 * sizeof(float).
+    const size_t cpu_block_size_total = num_decoder_layers * (num_kv_heads[0] + num_kv_heads[1]) * cpu_block_size * (head_size + 2 * sizeof(float)) * sizeof(uint8_t);
+    size_t expected_size = kv_cache_total_size / cpu_block_size_total * cpu_block_size_total;
+    ASSERT_EQ(get_total_allocated_bytes(cache_manager, num_decoder_layers), expected_size);
 }

diff --git a/tests/cpp/device_config.cpp b/tests/cpp/device_config.cpp
index 973648f637..9af87bab56 100644
--- a/tests/cpp/device_config.cpp
+++ b/tests/cpp/device_config.cpp
@@ -20,12 +20,12 @@ TEST(TestDeviceConfig, kv_cache_precision_u8) {
     size_t head_size = 64, head_size_u8 = head_size + 8;
     std::vector<size_t> num_kv_heads(12, 12);

-    ov::genai::DeviceConfig device_config_default(core, scheduler_config, "CPU");
-    device_config_default.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);
+    ov::genai::DeviceConfig device_config_f16(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::f16) });
+    device_config_f16.set_model_params(num_kv_heads, head_size_u8, num_decoder_layers);

-    ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU", { ov::hint::kv_cache_precision(ov::element::u8) });
+    ov::genai::DeviceConfig device_config_u8(core, scheduler_config, "CPU");
     device_config_u8.set_model_params(num_kv_heads, head_size, num_decoder_layers);

     const auto ratio = ov::element::f16.size() / ov::element::u8.size();
-    ASSERT_EQ(device_config_default.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks());
+    ASSERT_EQ(device_config_f16.get_num_kv_blocks() * ratio, device_config_u8.get_num_kv_blocks());
 }
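The arithmetic in the new assertion can be checked by hand: per token and per head, a u8 block stores head_size quantized bytes plus an f32 scale and an f32 zero point, i.e. 64 + 8 = 72 bytes. A compile-time sketch with the test's numbers follows, assuming scheduler_config.cache_size is 2 GB; the fixture's actual value lies outside this hunk:

#include <cstddef>

// Worked example under the stated assumption; constants mirror the test above.
constexpr std::size_t layers = 12;                                // num_decoder_layers
constexpr std::size_t kv_heads = 12 + 12;                         // num_kv_heads[0] + num_kv_heads[1]
constexpr std::size_t block_tokens = 32;                          // cpu_block_size
constexpr std::size_t token_head_bytes = 64 + 2 * sizeof(float);  // u8 data + scale + zero point = 72
constexpr std::size_t block_bytes = layers * kv_heads * block_tokens * token_head_bytes;  // 663552
constexpr std::size_t budget = std::size_t{2} * 1024 * 1024 * 1024;                       // assumed 2 GB
// The allocation is rounded down to a whole number of blocks:
static_assert(budget / block_bytes * block_bytes == 2147254272, "expected_size under these assumptions");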
diff --git a/tests/cpp/helper.cpp b/tests/cpp/helper.cpp
new file mode 100644
index 0000000000..6dd7664523
--- /dev/null
+++ b/tests/cpp/helper.cpp
@@ -0,0 +1,29 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include "helper.hpp"
+#include "openvino/op/concat.hpp"
+
+std::shared_ptr<ov::Model> get_dummy_model(ov::Core core, size_t num_layers) {
+    ov::NodeVector keys;
+    ov::NodeVector values;
+    ov::ParameterVector params;
+    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
+    ov::element::Type kv_cache_type = core.get_property("CPU", ov::hint::kv_cache_precision);
+
+    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
+    for (size_t i = 0; i < num_layers; i++) {
+        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
+        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
+        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
+        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
+        keys.push_back(key);
+        values.push_back(value);
+        params.push_back(key);
+        params.push_back(value);
+    }
+    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
+    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
+    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
+}
diff --git a/tests/cpp/helper.hpp b/tests/cpp/helper.hpp
new file mode 100644
index 0000000000..1fafe8bcf6
--- /dev/null
+++ b/tests/cpp/helper.hpp
@@ -0,0 +1,8 @@
+// Copyright (C) 2023-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "openvino/runtime/core.hpp"
+
+std::shared_ptr<ov::Model> get_dummy_model(ov::Core core, size_t num_layers);
\ No newline at end of file
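Both test files now obtain the dummy model from this shared helper, and its parameter element type tracks the plugin-reported KV-cache precision rather than the inference precision. A short usage sketch mirroring the call sites in the two tests; make_dummy_request is a hypothetical wrapper name, not part of the patch:

#include "helper.hpp"

// Mirrors the call sites in cache_manager.cpp and scheduler.cpp: build the
// dummy KV-cache model, compile it on the default device, and hand the
// infer request to the component under test.
ov::InferRequest make_dummy_request() {
    ov::Core core;
    auto model = get_dummy_model(core, /*num_layers=*/12);
    return core.compile_model(model).create_infer_request();
}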
diff --git a/tests/cpp/scheduler.cpp b/tests/cpp/scheduler.cpp
index 23594adf50..6eeb8b8ed4 100644
--- a/tests/cpp/scheduler.cpp
+++ b/tests/cpp/scheduler.cpp
@@ -9,6 +9,7 @@
 #include "openvino/genai/generation_config.hpp"
 #include "sequence_group.hpp"
 #include "scheduler.hpp"
+#include "helper.hpp"

 using namespace ov::genai;

@@ -18,34 +19,11 @@ void clear_finished_sequences(std::vector<SequenceGroup::Ptr>& requests) {
     });
     requests.erase(new_end, requests.end());
 }
-std::shared_ptr<ov::Model> get_model(ov::Core core, size_t num_layers) {
-    ov::NodeVector keys;
-    ov::NodeVector values;
-    ov::ParameterVector params;
-    ov::element::Type inference_precision = core.get_property("CPU", ov::hint::inference_precision);
-    ov::element::Type kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-
-    auto shape = ov::PartialShape({ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic(), ov::Dimension::dynamic()});
-    for (size_t i = 0; i < num_layers; i++) {
-        auto key = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        auto value = std::make_shared<ov::op::v0::Parameter>(kv_cache_type, shape);
-        key->get_output_tensor(0).set_names({"key_cache." + std::to_string(i)});
-        value->get_output_tensor(0).set_names({"value_cache." + std::to_string(i)});
-        keys.push_back(key);
-        values.push_back(value);
-        params.push_back(key);
-        params.push_back(value);
-    }
-    const auto& concat1 = std::make_shared<ov::op::v0::Concat>(keys, 1);
-    const auto& concat2 = std::make_shared<ov::op::v0::Concat>(values, 1);
-    auto model = std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-    return std::make_shared<ov::Model>(ov::NodeVector{concat1, concat2}, params);
-}

 std::shared_ptr<CacheManager> init_cache_manager(SchedulerConfig scheduler_config) {
     ov::Core core = ov::Core();
     size_t num_decoder_layers = 12;
-    ov::InferRequest request = core.compile_model(get_model(core, num_decoder_layers)).create_infer_request();
+    ov::InferRequest request = core.compile_model(get_dummy_model(core, num_decoder_layers)).create_infer_request();
     size_t head_size = 64, head_size_u8 = head_size + 8;
     std::vector<size_t> num_kv_heads(12, 12);
     ov::genai::DeviceConfig device_config(core, scheduler_config, "CPU");