Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions .github/workflows/_base_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,28 @@ jobs:

curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_sot.yaml\", \"--enable-logprob\": \"False\"}"
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_sot_wint4.yaml\", \"--enable-logprob\": \"False\"}"
check_service 360
export TEMPLATE=TOKEN_NORMAL
python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1

curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_cinn.yaml\", \"--enable-logprob\": \"False\"}"
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_cinn_wint4.yaml\", \"--enable-logprob\": \"False\"}"
check_service 360
export TEMPLATE=TOKEN_NORMAL
python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1

curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_sot_fp8.yaml\", \"--enable-logprob\": \"False\"}"
check_service 360
export TEMPLATE=TOKEN_NORMAL
python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1

curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
-H "Content-Type: application/json" \
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_cinn_fp8.yaml\", \"--enable-logprob\": \"False\"}"
check_service 360
export TEMPLATE=TOKEN_NORMAL
python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1
Expand Down
5 changes: 5 additions & 0 deletions custom_ops/gpu_ops/update_inputs_v1.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ __global__ void update_inputs_kernel_v1(bool* not_need_stop,
}
if (thread_idx < bsz) {
if (stop_flag_now) {
// chunked prefill (e.g. when max_tokens=1): prompt not fully consumed yet,
// so invalidate the sampled token id for this slot
if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] <
prompt_lens[thread_idx]) {
topk_ids[thread_idx] = -1;
}
seq_lens_this_time[thread_idx] = 0; // stop at next step
seq_lens_decoder[thread_idx] = 0;
seq_lens_encoder[thread_idx] = 0;
Expand Down
15 changes: 14 additions & 1 deletion fastdeploy/distributed/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import paddle.distributed as dist
from paddle.distributed import fleet

from fastdeploy.utils import register_custom_python_op

_TP_AR = None


Expand Down Expand Up @@ -50,7 +52,18 @@ def custom_ar_clear_ipc_handles():

try:

@paddle.jit.marker.unified
def tensor_model_parallel_all_reduce_infer_meta(x: "paddle.static.MetaTensor", group_) -> paddle.static.MetaTensor:
return paddle.static.MetaTensor(shape=x.shape, dtype=x.dtype)

@register_custom_python_op(
name="tensor_model_parallel_all_reduce",
infer_meta=tensor_model_parallel_all_reduce_infer_meta,
input_names=[
"input_",
],
output_names=["out"],
inplace_map={},
)
def tensor_model_parallel_all_reduce(
input_: paddle.Tensor,
group_: paddle.distributed.communication.group.Group = None,
Expand Down
Loading
Loading