diff --git a/.github/workflows/ci_image_update.yml b/.github/workflows/ci_image_update.yml
index 7e6544e6364..da1256e204c 100644
--- a/.github/workflows/ci_image_update.yml
+++ b/.github/workflows/ci_image_update.yml
@@ -137,10 +137,19 @@ jobs:
       FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
       MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
+  stable_test:
+    name: Run Stable Tests
+    needs: [clone,build_sm8090,ci_image_build]
+    uses: ./.github/workflows/_stable_test.yml
+    with:
+      DOCKER_IMAGE: ${{ needs.ci_image_build.outputs.docker_name_precheck }}
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      FASTDEPLOY_WHEEL_URL: ${{ needs.build_sm8090.outputs.wheel_path }}
+      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
 
   publish_pre_check:
     name: Publish Docker Images Pre Check
-    needs: [ci_image_build, unittest_coverage,logprob_test,pre_ce_test,base_test]
+    needs: [ci_image_build,unittest_coverage,logprob_test,pre_ce_test,base_test,stable_test]
     runs-on: [self-hosted, Docker-Build]
     steps:
       - name: Images Uploading
diff --git a/.github/workflows/pr_build_and_test.yml b/.github/workflows/pr_build_and_test.yml
index 5abd24966d8..da1630e07cf 100644
--- a/.github/workflows/pr_build_and_test.yml
+++ b/.github/workflows/pr_build_and_test.yml
@@ -75,3 +75,13 @@ jobs:
       FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
       FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
       MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
+
+  stable_test:
+    name: Run Stable Tests
+    needs: [clone,build]
+    uses: ./.github/workflows/_stable_test.yml
+    with:
+      DOCKER_IMAGE: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddleqa:fastdeploy-ciuse-cuda126-dailyupdate
+      FASTDEPLOY_ARCHIVE_URL: ${{ needs.clone.outputs.repo_archive_url }}
+      FASTDEPLOY_WHEEL_URL: ${{ needs.build.outputs.wheel_path }}
+      MODEL_CACHE_DIR: "/ssd2/actions-runner/ModelData"
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 4c3f262ff28..9170dfb4c8a 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1789,6 +1789,12 @@ def postprocess(self):
         # It will hang when real batch_size < tp_size
         self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size)
 
+        # adjust speculative config
+        if self.speculative_config is not None and self.speculative_config.method == "mtp":
+            if self.scheduler_config.splitwise_role == "prefill":
+                self.speculative_config.num_speculative_tokens = 1
+                self.speculative_config.num_model_steps = 1
+
         if self.scheduler_config.splitwise_role == "mixed":
             self._disable_sequence_parallel_moe_if_needed("Mixed")
             self.model_config.moe_phase = MoEPhase(phase="prefill")
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index d14ad8897c4..96c4c57f5c7 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -240,7 +240,7 @@ class EngineArgs:
     """
     Flag to enable prefix caching.
     """
-    enable_output_caching: bool = True
+    enable_output_caching: bool = False
     """
     Flag to enable kv cache for output tokens, only valid in V1 scheduler.
     """
diff --git a/tests/ce/stable_cases/launch_model.sh b/tests/ce/stable_cases/launch_model.sh
index 3b758a15a2a..570b37d6569 100644
--- a/tests/ce/stable_cases/launch_model.sh
+++ b/tests/ce/stable_cases/launch_model.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 
 MODEL_PATH="${1}/TP2"
 
-FD_API_PORT=${FD_API_PORT:-8000}
-FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT:-8001}
-FD_METRICS_PORT=${FD_METRICS_PORT:-8002}
-FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT:-8003}
+FD_API_PORT=${FD_API_PORT:-8180}
+FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT:-8181}
+FD_METRICS_PORT=${FD_METRICS_PORT:-8182}
+FD_CACHE_QUEUE_PORT=${FD_CACHE_QUEUE_PORT:-8183}
 
@@ -36,7 +36,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --engine-worker-queue-port ${FD_ENGINE_QUEUE_PORT} \
    --metrics-port ${FD_METRICS_PORT} \
    --cache-queue-port ${FD_CACHE_QUEUE_PORT} \
-    --quantization wint8 \
     --max-model-len 32768 \
     --max-num-seqs 1 \
     --gpu-memory-utilization 0.9 \