16 changes: 16 additions & 0 deletions benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml
@@ -0,0 +1,16 @@
max_model_len: 32768
max_num_seqs: 256
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.8
tensor_parallel_size: 4
cache_queue_port: 55663
enable_chunked_prefill: True
splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
max_long_partial_prefills: 3
quantization: wint4
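
The decode-role config above deliberately uses ports disjoint from the prefill config that follows (engine_worker_queue_port 6678 vs. 6677, pd_comm_port 2334 vs. 2333, rdma_comm_ports 7671-7674 vs. 7675-7678), so both instances can share a host. A minimal sketch of loading and sanity-checking this file; the path comes from this PR, but the checks themselves are illustrative, not a FastDeploy API:

import yaml

# Hypothetical sanity check for the decode-router config in this PR.
with open("benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["splitwise_role"] == "decode"
# With chunked prefill on, at most 384 prefill tokens are batched per step,
# far below max_model_len, so long prompts cannot stall ongoing decode steps.
assert cfg["enable_chunked_prefill"] is True
assert cfg["max_num_batched_tokens"] < cfg["max_model_len"]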
13 changes: 13 additions & 0 deletions benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml
@@ -0,0 +1,13 @@
max_model_len: 32768
max_num_seqs: 16
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.9
tensor_parallel_size: 4
splitwise_role: prefill
enable_prefix_caching: True
cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
pd_comm_port: "2333"
quantization: wint4
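
Together the two files describe a splitwise prefill/decode pair: the prefill side runs few concurrent sequences (max_num_seqs: 16) with prefix caching enabled and a higher kv_cache_ratio, while the decode side favors throughput with max_num_seqs: 256 and chunked prefill. An illustrative cross-check that their port assignments stay disjoint (plain YAML parsing only, no FastDeploy imports; the helper below is not part of the library):

import yaml

def ports_used(path):
    # Collect every port a router config claims (illustrative helper).
    with open(path) as f:
        cfg = yaml.safe_load(f)
    used = {cfg["cache_queue_port"], cfg["engine_worker_queue_port"], int(cfg["pd_comm_port"])}
    used.update(int(p) for p in cfg["rdma_comm_ports"].split(","))
    return used

decode = ports_used("benchmarks/yaml/eb45-32k-wint4-tp4_decode_router.yaml")
prefill = ports_used("benchmarks/yaml/eb45-32k-wint4-tp4_prefill_router.yaml")
assert decode.isdisjoint(prefill), f"port collision: {decode & prefill}"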
2 changes: 1 addition & 1 deletion fastdeploy/worker/gpu_worker.py
@@ -141,7 +141,7 @@ def determine_available_memory(self) -> int:
paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated(local_rank)

model_block_memory_used = self.cal_theortical_kvcache()
-    paddle_peak_increase = paddle_allocated_mem_after_run - paddle_allocated_mem_before_run
+    paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run

paddle.device.cuda.empty_cache()

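The one-line fix in determine_available_memory switches the peak-increase term from the allocated-memory counter to the reserved-memory counter (paddle_reserved_mem_after_run is presumably captured earlier in the same function). Paddle's caching allocator can hold in reserve more device memory than live tensors currently use, and that reserved pool is what actually occupies the GPU during the profile run, so measuring the increase against max_memory_allocated under-counts the footprint and risks over-provisioning the KV cache. A standalone sketch of the distinction, assuming Paddle's CUDA memory counters (max_memory_allocated / max_memory_reserved and their reset helpers) behave as documented:

import paddle

local_rank = 0
paddle.device.cuda.reset_max_memory_allocated(local_rank)
paddle.device.cuda.reset_max_memory_reserved(local_rank)

x = paddle.randn([4096, 4096])   # workload stand-in for the profile run
y = x @ x
del y                            # freed tensors return to the pool, not to the driver

allocated_peak = paddle.device.cuda.max_memory_allocated(local_rank)
reserved_peak = paddle.device.cuda.max_memory_reserved(local_rank)
# reserved_peak >= allocated_peak: the allocator's pool keeps freed blocks,
# which is why the fix measures the increase against the reserved counter.
print(f"reserved={reserved_peak} allocated={allocated_peak}")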