From 379f3025bc88fcdc9beb7eec9e3f2e0db5b05482 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Thu, 6 Feb 2025 10:07:38 +0100
Subject: [PATCH] Allow int8 KV cache precision

---
 src/cpp/src/continuous_batching_impl.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index be1eba04f9..333f15dff7 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -53,11 +53,6 @@ void apply_kv_cache_precision(const std::shared_ptr& model, const std
             // x86 and ARM have different default kv cache type, take this information from the plugin
             m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
         }
-
-        // TEMP WA: currently FP16 / BF16 KV cache is faster than U8 for PagedAttention
-        if (m_kv_cache_type == ov::element::u8) {
-            m_kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-        }
     } else if (device.find("GPU") != std::string::npos) {
         if (accuracy_mode) {
             inference_precision = ov::element::f32;