From 379f3025bc88fcdc9beb7eec9e3f2e0db5b05482 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Thu, 6 Feb 2025 10:07:38 +0100
Subject: [PATCH] Allow u8 KV cache precision by removing the FP16/BF16 override

---
 src/cpp/src/continuous_batching_impl.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index be1eba04f9..333f15dff7 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -53,11 +53,6 @@ void apply_kv_cache_precision(const std::shared_ptr<ov::Model>& model, const std
             // x86 and ARM have different default kv cache type, take this information from the plugin
             m_kv_cache_type = core.get_property(device, ov::hint::kv_cache_precision);
         }
-
-        // TEMP WA: currently FP16 / BF16 KV cache is faster than U8 for PagedAttention
-        if (m_kv_cache_type == ov::element::u8) {
-            m_kv_cache_type = inference_precision == ov::element::bf16 ? ov::element::bf16 : ov::element::f16;
-        }
     } else if (device.find("GPU") != std::string::npos) {
         if (accuracy_mode) {
             inference_precision = ov::element::f32;