PaddlePaddle · Jiang-Jia-Jun · Jan 6, 2026 · Jan 5, 2026
diff --git a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
@@ -94,6 +94,8 @@
     [2560, 1536, 64, 0, 128],
     [1536, 2560, 64, 0, 128],
     [2560, 768, 64, 0, 128],
+    [768, 2048, 128, 0, 128],
+    [2048, 384, 128, 0, 128],
 ]
 
 dtype = ["BF16"]

diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -36,6 +36,19 @@
 ]
 
 
+def _compute_hadamard_block_size(moe_intermediate_size: int, tp_size: int) -> int:
+    """Compute the Hadamard block size for one shard of the MoE intermediate size.
+
+    The result is the largest power of two that divides
+    ``moe_intermediate_size // tp_size``, capped at 512.
+
+    Args:
+        moe_intermediate_size: Total MoE intermediate size before it is split
+            into ``tp_size`` equal shards.
+        tp_size: Number of shards; must evenly divide ``moe_intermediate_size``.
+
+    Returns:
+        The block size, never larger than 512. An odd shard size yields 1 and
+        a shard size of 0 yields 0.
+
+    Raises:
+        ValueError: If ``moe_intermediate_size`` is not divisible by ``tp_size``.
+        ZeroDivisionError: If ``tp_size`` is 0 (raised by the ``%`` check).
+    """
+    # Reject uneven splits up front so the floor division below is exact.
+    if moe_intermediate_size % tp_size != 0:
+        raise ValueError(
+            f"moe_intermediate_size ({moe_intermediate_size}) must be divisible by " f"tp_size ({tp_size})"
+        )
+
+    shard_size = moe_intermediate_size // tp_size
+    # x & -x isolates the lowest set bit of x, i.e. the largest power of two
+    # that divides shard_size.
+    block_size = shard_size & (-shard_size)
+    # Upper bound on the block size.
+    # NOTE(review): 512 presumably matches the largest block the Hadamard
+    # kernel supports - confirm against the kernel implementation.
+    block_size = min(block_size, 512)
+
+    return block_size
+
+
 def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
     if args.quantization is not None and isinstance(args.quantization, str):
         args.quantization = parse_quantization(args.quantization)
@@ -89,7 +102,12 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
             quantization_config["dense_quant_type"] = "block_wise_fp8"
             quantization_config["moe_quant_type"] = "w4afp8"
             tp_size = getattr(args, "tensor_parallel_size", 1)
-            quantization_config["hadamard_block_size"] = 512 // tp_size
+            moe_intermediate_size = getattr(model_config, "moe_intermediate_size", None)
+            if moe_intermediate_size is not None:
+                hadamard_block_size = _compute_hadamard_block_size(moe_intermediate_size, tp_size)
+                quantization_config["hadamard_block_size"] = hadamard_block_size
+            else:
+                quantization_config["hadamard_block_size"] = 512
             quantization_config["quantization"] = "mix_quant"
             quant_config_name = "mix_quant"
     else:

diff --git a/...Lite_with_w4afp8/test_ernie_4_5_w4afp8.py → ...se/w4afp8/test_moe_w4afp8_online_quant.py b/...Lite_with_w4afp8/test_ernie_4_5_w4afp8.py → ...se/w4afp8/test_moe_w4afp8_online_quant.py
@@ -49,6 +49,12 @@
         "model_name": "ERNIE-4.5-21B-A3B-PT",
         "model_subdir": "torch",
     },
+    {
+        "id": "w4afp8_default_v1",
+        "load_choices": "default_v1",
+        "model_name": "Qwen3-30B-A3B",
+        "model_subdir": "torch",
+    },
 ]