From 770693f3fa595a5f66a9ea504687a9fd6865d4e9 Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Mon, 5 Jan 2026 12:59:47 +0000
Subject: [PATCH] fix hadamard_block_size

---
 .../utils/auto_gen_w4afp8_gemm_kernel.py      |  2 ++
 .../layers/quantization/__init__.py           | 20 ++++++++++++++++++-
 .../test_moe_w4afp8_online_quant.py}          |  6 ++++++
 3 files changed, 27 insertions(+), 1 deletion(-)
 rename tests/ci_use/{EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py => w4afp8/test_moe_w4afp8_online_quant.py} (98%)

diff --git a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
index 194da2bdde6..d325cdc5c86 100644
--- a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
+++ b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
@@ -94,6 +94,8 @@
     [2560, 1536, 64, 0, 128],
     [1536, 2560, 64, 0, 128],
     [2560, 768, 64, 0, 128],
+    [768, 2048, 128, 0, 128],
+    [2048, 384, 128, 0, 128],
 ]
 
 dtype = ["BF16"]
diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py
index 2f2421b3beb..5d8ede07894 100644
--- a/fastdeploy/model_executor/layers/quantization/__init__.py
+++ b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -36,6 +36,19 @@
 ]
 
 
+def _compute_hadamard_block_size(moe_intermediate_size: int, tp_size: int) -> int:
+    if moe_intermediate_size % tp_size != 0:
+        raise ValueError(
+            f"moe_intermediate_size ({moe_intermediate_size}) must be divisible by tp_size ({tp_size})"
+        )
+
+    shard_size = moe_intermediate_size // tp_size
+    block_size = shard_size & (-shard_size)
+    block_size = min(block_size, 512)
+
+    return block_size
+
+
 def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
     if args.quantization is not None and isinstance(args.quantization, str):
         args.quantization = parse_quantization(args.quantization)
@@ -89,7 +102,12 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
             quantization_config["dense_quant_type"] = "block_wise_fp8"
             quantization_config["moe_quant_type"] = "w4afp8"
             tp_size = getattr(args, "tensor_parallel_size", 1)
-            quantization_config["hadamard_block_size"] = 512 // tp_size
+            moe_intermediate_size = getattr(model_config, "moe_intermediate_size", None)
+            if moe_intermediate_size is not None:
+                hadamard_block_size = _compute_hadamard_block_size(moe_intermediate_size, tp_size)
+                quantization_config["hadamard_block_size"] = hadamard_block_size
+            else:
+                quantization_config["hadamard_block_size"] = 512
             quantization_config["quantization"] = "mix_quant"
             quant_config_name = "mix_quant"
         else:
diff --git a/tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py b/tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
similarity index 98%
rename from tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py
rename to tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
index 5f3f5b693ca..2c25970db3d 100644
--- a/tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py
+++ b/tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
@@ -49,6 +49,12 @@
         "model_name": "ERNIE-4.5-21B-A3B-PT",
         "model_subdir": "torch",
     },
+    {
+        "id": "w4afp8_default_v1",
+        "load_choices": "default_v1",
+        "model_name": "Qwen3-30B-A3B",
+        "model_subdir": "torch",
+    },
 ]
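
Reviewer note (not part of the patch): a minimal standalone sketch of the block-size
computation introduced above, with a few worked values. The helper mirrors
_compute_hadamard_block_size; the (size, tp) pairs below are illustrative and not
taken from any shipped config.

def compute_hadamard_block_size(moe_intermediate_size: int, tp_size: int) -> int:
    if moe_intermediate_size % tp_size != 0:
        raise ValueError(
            f"moe_intermediate_size ({moe_intermediate_size}) must be divisible by tp_size ({tp_size})"
        )
    shard_size = moe_intermediate_size // tp_size
    # shard_size & (-shard_size) isolates the lowest set bit of shard_size,
    # i.e. the largest power of two that divides shard_size evenly.
    return min(shard_size & (-shard_size), 512)

for size, tp in [(768, 1), (768, 2), (2048, 1), (3072, 4)]:
    print(size, tp, "->", compute_hadamard_block_size(size, tp))
# 768  tp=1 -> 256  (768 = 3 * 256)
# 768  tp=2 -> 128  (shard 384 = 3 * 128)
# 2048 tp=1 -> 512  (2048 is a power of two; capped at 512)
# 3072 tp=4 -> 256  (shard 768 = 3 * 256)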
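
Why the old formula needed fixing, assuming the Hadamard block size must evenly
divide the per-rank shard of moe_intermediate_size (an inference from this patch,
not something it states): 512 // tp_size depends only on the TP degree and ignores
the actual intermediate size, so for a shard such as 384 it returns a block that
does not divide the shard. A sketch contrasting the two, with illustrative sizes:

def old_block_size(tp_size: int) -> int:
    # Pre-patch heuristic: depends only on the TP degree.
    return 512 // tp_size

def new_block_size(moe_intermediate_size: int, tp_size: int) -> int:
    # Post-patch: largest power of two dividing the per-rank shard, capped at 512.
    shard = moe_intermediate_size // tp_size
    return min(shard & (-shard), 512)

for size, tp in [(1536, 2), (768, 2)]:
    shard = size // tp
    old, new = old_block_size(tp), new_block_size(size, tp)
    print(f"shard={shard}: old={old} (divides: {shard % old == 0}), new={new} (divides: {shard % new == 0})")
# shard=768: old=256 (divides: True),  new=256 (divides: True)
# shard=384: old=256 (divides: False), new=128 (divides: True)

The new [768, 2048, 128, 0, 128] and [2048, 384, 128, 0, 128] kernel shapes added in
auto_gen_w4afp8_gemm_kernel.py appear consistent with this: a 384-wide shard gets a
block size of 128.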