From 21c5ca82271db32184a827c7386fa7622a0ea3d0 Mon Sep 17 00:00:00 2001 From: xiegegege Date: Wed, 3 Dec 2025 16:44:50 +0800 Subject: [PATCH 1/2] fix deepseek --- fastdeploy/worker/gpu_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index 9fcf9efcc9a..7af9c808d42 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -141,7 +141,7 @@ def determine_available_memory(self) -> int: paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated(local_rank) model_block_memory_used = self.cal_theortical_kvcache() - paddle_peak_increase = paddle_allocated_mem_after_run - paddle_allocated_mem_before_run + paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run paddle.device.cuda.empty_cache() From 19ac7230f92af0328352c4018aa40a5ee3cc7f87 Mon Sep 17 00:00:00 2001 From: xiegegege Date: Tue, 9 Dec 2025 21:21:30 +0800 Subject: [PATCH 2/2] add fastdeploy benchmark,paddletest-155 --- .../yaml/eb45-32k-wint4-tp4_decode_router.yaml | 16 ++++++++++++++++ .../yaml/eb45-32k-wint4-tp4_prefill_router.yaml | 13 +++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml create mode 100644 benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml new file mode 100644 index 00000000000..34de7cd762f --- /dev/null +++ b/benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml @@ -0,0 +1,16 @@ +max_model_len: 32768 +max_num_seqs: 256 +gpu_memory_utilization: 0.9 +kv_cache_ratio: 0.8 +tensor_parallel_size: 4 +cache_queue_port: 55663 +enable_chunked_prefill: True +splitwise_role: decode +engine_worker_queue_port: 6678 +cache_transfer_protocol: "rdma,ipc" +rdma_comm_ports: "7671,7672,7673,7674" +pd_comm_port: "2334" +max_num_batched_tokens: 384 +max_num_partial_prefills: 3 +max_long_partial_prefills: 3 +quantization: wint4 diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml new file mode 100644 index 00000000000..cf4b4a51ddb --- /dev/null +++ b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml @@ -0,0 +1,13 @@ +max_model_len: 32768 +max_num_seqs: 16 +gpu_memory_utilization: 0.9 +kv_cache_ratio: 0.9 +tensor_parallel_size: 4 +splitwise_role: prefill +enable_prefix_caching: True +cache_queue_port: 55664 +engine_worker_queue_port: 6677 +cache_transfer_protocol: "rdma,ipc" +rdma_comm_ports: "7675,7676,7677,7678" +pd_comm_port: "2333" +quantization: wint4