diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index c8f9e86a9cc7c..388d3a91228c8 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -89,6 +89,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: q_scale = layer.q_scale.to("cpu").tolist() if current_platform.is_rocm() and not is_navi(): q_scale *= 2 + layer.calculate_kv_scales = False else: q_scale = 1.0 if layer.prob_scale > 0.0: