PaddlePaddle · yangjianfengo1 · Sep 25, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -249,6 +249,7 @@ paddle::Tensor MoeExpertFFNFunc(
     const paddle::Tensor& permute_input,
     const paddle::Tensor& tokens_expert_prefix_sum,
     const paddle::Tensor& up_gate_proj_weight, const paddle::Tensor& down_proj_weight,
+    const paddle::optional<paddle::Tensor>& up_proj_in_scale,
     const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
     const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
     const paddle::optional<paddle::Tensor>& down_proj_scale,

diff --git a/...m_ops/gpu_ops/moe/fast_hardamard_kernel.h → ...moe/fast_hardmard/fast_hardamard_kernel.h b/...m_ops/gpu_ops/moe/fast_hardamard_kernel.h → ...moe/fast_hardmard/fast_hardamard_kernel.h
diff --git a/..._ops/gpu_ops/moe/fast_hardamard_kernel.cu → ...e/fast_hardmard/fast_hardamard_kernel.hpp b/..._ops/gpu_ops/moe/fast_hardamard_kernel.cu → ...e/fast_hardmard/fast_hardamard_kernel.hpp
@@ -974,79 +974,3 @@ void MoeFastHardamardWrapper(const T *x_data,
     }
   }
 }
-
-template void MoeFastHardamardWrapper<phi::dtype::float16, phi::dtype::float16>(
-  const phi::dtype::float16 *x_data,
-  const int64_t *expert_idx_per_token,
-  const int64_t *recv_expert_count,
-  const phi::dtype::float16 *shift,
-  const phi::dtype::float16 *smooth,
-  const float* quant_scales,
-  const int quant_round_type,
-  const float quant_max_bound,
-  const float quant_min_bound,
-  const int64_t token_num,
-  const int64_t dim,
-  const int num_max_tokens_per_expert,
-  bool used_in_ep_low_latency,
-  const int hadamard_block_size,
-  phi::dtype::float16 *out,
-  cudaStream_t &stream
-);
-
-template void MoeFastHardamardWrapper<phi::dtype::float16, int8_t>(
-  const phi::dtype::float16 *x_data,
-  const int64_t *expert_idx_per_token,
-  const int64_t *recv_expert_count,
-  const phi::dtype::float16 *shift,
-  const phi::dtype::float16 *smooth,
-  const float* quant_scales,
-  const int quant_round_type,
-  const float quant_max_bound,
-  const float quant_min_bound,
-  const int64_t token_num,
-  const int64_t dim,
-  const int num_max_tokens_per_expert,
-  bool used_in_ep_low_latency,
-  const int hadamard_block_size,
-  int8_t *out,
-  cudaStream_t &stream
-);
-
-template void MoeFastHardamardWrapper<phi::dtype::bfloat16, phi::dtype::bfloat16>(
-  const phi::dtype::bfloat16 *x_data,
-  const int64_t *expert_idx_per_token,
-  const int64_t *recv_expert_count,
-  const phi::dtype::bfloat16 *shift,
-  const phi::dtype::bfloat16 *smooth,
-  const float* quant_scales,
-  const int quant_round_type,
-  const float quant_max_bound,
-  const float quant_min_bound,
-  const int64_t token_num,
-  const int64_t dim,
-  const int num_max_tokens_per_expert,
-  bool used_in_ep_low_latency,
-  const int hadamard_block_size,
-  phi::dtype::bfloat16 *out,
-  cudaStream_t &stream
-);
-
-template void MoeFastHardamardWrapper<phi::dtype::bfloat16, int8_t>(
-  const phi::dtype::bfloat16 *x_data,
-  const int64_t *expert_idx_per_token,
-  const int64_t *recv_expert_count,
-  const phi::dtype::bfloat16 *shift,
-  const phi::dtype::bfloat16 *smooth,
-  const float* quant_scales,
-  const int quant_round_type,
-  const float quant_max_bound,
-  const float quant_min_bound,
-  const int64_t token_num,
-  const int64_t dim,
-  const int num_max_tokens_per_expert,
-  bool used_in_ep_low_latency,
-  const int hadamard_block_size,
-  int8_t *out,
-  cudaStream_t &stream
-);
diff --git a/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_bf16_bf16.cu b/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_bf16_bf16.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fast_hardamard_kernel.hpp"
+
+template void MoeFastHardamardWrapper<phi::dtype::bfloat16, phi::dtype::bfloat16>(  // explicit instantiation: bfloat16 in, bfloat16 out
+  const phi::dtype::bfloat16 *x_data,  // x_data/shift/smooth use the first template argument
+  const int64_t *expert_idx_per_token,
+  const int64_t *recv_expert_count,
+  const phi::dtype::bfloat16 *shift,
+  const phi::dtype::bfloat16 *smooth,
+  const float* quant_scales,
+  const int quant_round_type,
+  const float quant_max_bound,
+  const float quant_min_bound,
+  const int64_t token_num,
+  const int64_t dim,
+  const int num_max_tokens_per_expert,
+  bool used_in_ep_low_latency,
+  const int hadamard_block_size,
+  phi::dtype::bfloat16 *out,  // out uses the second template argument
+  cudaStream_t &stream
+);  // parameter types must match the template in fast_hardamard_kernel.hpp
diff --git a/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_bf16_fp8.cu b/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_bf16_fp8.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fast_hardamard_kernel.hpp"
+
+template void MoeFastHardamardWrapper<phi::dtype::bfloat16, phi::dtype::float8_e4m3fn>(  // explicit instantiation: bfloat16 in, float8_e4m3fn out
+  const phi::dtype::bfloat16 *x_data,  // x_data/shift/smooth use the first template argument
+  const int64_t *expert_idx_per_token,
+  const int64_t *recv_expert_count,
+  const phi::dtype::bfloat16 *shift,
+  const phi::dtype::bfloat16 *smooth,
+  const float* quant_scales,
+  const int quant_round_type,
+  const float quant_max_bound,
+  const float quant_min_bound,
+  const int64_t token_num,
+  const int64_t dim,
+  const int num_max_tokens_per_expert,
+  bool used_in_ep_low_latency,
+  const int hadamard_block_size,
+  phi::dtype::float8_e4m3fn *out,  // out uses the second template argument
+  cudaStream_t &stream
+);  // parameter types must match the template in fast_hardamard_kernel.hpp
diff --git a/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_bf16_int8.cu b/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_bf16_int8.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fast_hardamard_kernel.hpp"
+
+template void MoeFastHardamardWrapper<phi::dtype::bfloat16, int8_t>(  // explicit instantiation: bfloat16 in, int8_t out
+  const phi::dtype::bfloat16 *x_data,  // x_data/shift/smooth use the first template argument
+  const int64_t *expert_idx_per_token,
+  const int64_t *recv_expert_count,
+  const phi::dtype::bfloat16 *shift,
+  const phi::dtype::bfloat16 *smooth,
+  const float* quant_scales,
+  const int quant_round_type,
+  const float quant_max_bound,
+  const float quant_min_bound,
+  const int64_t token_num,
+  const int64_t dim,
+  const int num_max_tokens_per_expert,
+  bool used_in_ep_low_latency,
+  const int hadamard_block_size,
+  int8_t *out,  // out uses the second template argument
+  cudaStream_t &stream
+);  // parameter types must match the template in fast_hardamard_kernel.hpp
diff --git a/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_fp16_fp16.cu b/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_fp16_fp16.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fast_hardamard_kernel.hpp"
+
+template void MoeFastHardamardWrapper<phi::dtype::float16, phi::dtype::float16>(  // explicit instantiation: float16 in, float16 out
+  const phi::dtype::float16 *x_data,  // x_data/shift/smooth use the first template argument
+  const int64_t *expert_idx_per_token,
+  const int64_t *recv_expert_count,
+  const phi::dtype::float16 *shift,
+  const phi::dtype::float16 *smooth,
+  const float* quant_scales,
+  const int quant_round_type,
+  const float quant_max_bound,
+  const float quant_min_bound,
+  const int64_t token_num,
+  const int64_t dim,
+  const int num_max_tokens_per_expert,
+  bool used_in_ep_low_latency,
+  const int hadamard_block_size,
+  phi::dtype::float16 *out,  // out uses the second template argument
+  cudaStream_t &stream
+);  // parameter types must match the template in fast_hardamard_kernel.hpp
diff --git a/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_fp16_int8.cu b/custom_ops/gpu_ops/moe/fast_hardmard/fast_hardamard_kernel_fp16_int8.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fast_hardamard_kernel.hpp"
+
+template void MoeFastHardamardWrapper<phi::dtype::float16, int8_t>(  // explicit instantiation: float16 in, int8_t out
+  const phi::dtype::float16 *x_data,  // x_data/shift/smooth use the first template argument
+  const int64_t *expert_idx_per_token,
+  const int64_t *recv_expert_count,
+  const phi::dtype::float16 *shift,
+  const phi::dtype::float16 *smooth,
+  const float* quant_scales,
+  const int quant_round_type,
+  const float quant_max_bound,
+  const float quant_min_bound,
+  const int64_t token_num,
+  const int64_t dim,
+  const int num_max_tokens_per_expert,
+  bool used_in_ep_low_latency,
+  const int hadamard_block_size,
+  int8_t *out,  // out uses the second template argument
+  cudaStream_t &stream
+);  // parameter types must match the template in fast_hardamard_kernel.hpp
diff --git a/custom_ops/gpu_ops/moe/fused_moe_helper.h b/custom_ops/gpu_ops/moe/fused_moe_helper.h
@@ -250,7 +250,7 @@ template <typename T, typename NvType> class MoeHelper {
 
     initialize_moe_routing_kernelLauncher(
         input_activations, permuted_data_, permuted_rows_, nullptr, nullptr,
-        expanded_source_row_to_expanded_dest_row, num_rows, num_rows,
+        expanded_source_row_to_expanded_dest_row, nullptr, num_rows, num_rows,
         hidden_size, k, stream);
 
     const int64_t expanded_active_expert_rows = k * num_rows;