From a7e99181f81052164946484ef1fafa02fb2b392d Mon Sep 17 00:00:00 2001
From: charlifu <charlifu@amd.com>
Date: Thu, 14 Nov 2024 16:06:57 +0000
Subject: [PATCH 1/2] Add stride-based heuristic for FP8 weight padding

---
 vllm/model_executor/layers/quantization/fp8.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 0803dcba5cbd2..de062d609fdfa 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -248,8 +248,10 @@ def process_weights_after_loading(self, layer: Module) -> None:
                 )
 
             # Pad the weight
-            if envs.VLLM_FP8_PADDING:
-                weight = F.pad(weight, (0, 256), "constant", 0)[..., :-256]
+            if envs.VLLM_FP8_PADDING and weight.stride(-1) == 1 \
+                and (weight.stride(-2) * weight.element_size()) % 512 == 0:
+                num_pad = 256 // weight.element_size()
+                weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
                 torch.cuda.empty_cache()
 
             # Update layer with new values.

From 7e8afeb2750313df7f43963e8fe1f8419232fc8c Mon Sep 17 00:00:00 2001
From: charlifu <charlifu@amd.com>
Date: Thu, 14 Nov 2024 16:16:09 +0000
Subject: [PATCH 2/2] Fix line-length lint in FP8 weight padding

---
 vllm/model_executor/layers/quantization/fp8.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index de062d609fdfa..205a7e19811e8 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -251,7 +251,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
             if envs.VLLM_FP8_PADDING and weight.stride(-1) == 1 \
                 and (weight.stride(-2) * weight.element_size()) % 512 == 0:
                 num_pad = 256 // weight.element_size()
-                weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
+                weight = F.pad(weight, (0, num_pad), "constant",
+                               0)[..., :-num_pad]
                 torch.cuda.empty_cache()
 
             # Update layer with new values.