14 changes: 7 additions & 7 deletions examples/splitwise/start_v0_tp1.sh
@@ -22,12 +22,12 @@ if [ -z "${KVCACHE_RDMA_NICS}" ]; then
fi

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh
source ${SCRIPT_DIR}/utils.sh

P_PORT=52400
D_PORT=52500
REDIS_PORT="${REDIS_PORT:-56388}"
REDIS_PORT="${REDIS_PORT:-6379}"
LOG_DATE=$(date +%Y%m%d_%H%M%S)

ports=(
$P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
@@ -51,8 +51,8 @@ sleep 1

# start prefill
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}
export FD_LOG_DIR="log/$LOG_DATE/prefill"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
@@ -76,8 +76,8 @@ wait_for_health ${P_PORT}

# start decode
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}
export FD_LOG_DIR="log/$LOG_DATE/decode"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
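With prefill and decode logs now grouped under a per-run directory, one run's output can be followed in a single command. A small sketch, assuming a run whose LOG_DATE resolved to the value below (the nohup file name comes from the output redirection used in the script):

    LOG_DATE=20250101_120000                      # illustrative timestamp for one run
    tail -f log/${LOG_DATE}/prefill/nohup log/${LOG_DATE}/decode/nohup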
51 changes: 2 additions & 49 deletions examples/splitwise/start_v1_dp2.sh
@@ -9,29 +9,19 @@ set -e
MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
DATA_PARALLEL_SIZE=2
TENSOR_PARALLEL_SIZE=1
NUM_GPUS=$(($DATA_PARALLEL_SIZE * $TENSOR_PARALLEL_SIZE))
LOG_DATE=$(date +%Y%m%d_%H%M%S)

export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=1
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
export FD_ENABLE_MULTI_API_SERVER=1

SCRIPT_PATH=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
exit 1
fi
source ${SCRIPT_DIR}/utils.sh

unset http_proxy && unset https_proxy
source ${SCRIPT_DIR}/utils.sh

# start router
ROUTER_PORT=$(get_free_ports 1)
echo "---------------------------"
echo ROUTER_PORT: $ROUTER_PORT

export FD_LOG_DIR="log/$LOG_DATE/router"
@@ -47,18 +37,7 @@ sleep 1

# start prefill
P_SERVER_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
P_METRICS_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
P_ENGINE_WORKER_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
P_CACHE_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
P_RDMA_COMM_PORTS=$(get_free_ports $NUM_GPUS)
P_PD_COMM_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
echo "---------------------------"
echo P_SERVER_PORTS: $P_SERVER_PORTS
echo P_METRICS_PORTS: $P_METRICS_PORTS
echo P_ENGINE_WORKER_QUEUE_PORTS: $P_ENGINE_WORKER_QUEUE_PORTS
echo P_CACHE_QUEUE_PORTS: $P_CACHE_QUEUE_PORTS
echo P_RDMA_COMM_PORTS: $P_RDMA_COMM_PORTS
echo P_PD_COMM_PORTS: $P_PD_COMM_PORTS

export CUDA_VISIBLE_DEVICES="0,1"
export FD_LOG_DIR="log/$LOG_DATE/prefill"
@@ -68,67 +47,41 @@ mkdir -p ${FD_LOG_DIR}
nohup python -m fastdeploy.entrypoints.openai.multi_api_server \
--num-servers ${DATA_PARALLEL_SIZE}\
--ports ${P_SERVER_PORTS} \
--metrics-port ${P_METRICS_PORTS} \
--args --model ${MODEL_NAME} \
--engine-worker-queue-port ${P_ENGINE_WORKER_QUEUE_PORTS} \
--cache-queue-port ${P_CACHE_QUEUE_PORTS} \
--max-model-len 32768 \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--splitwise-role "prefill" \
--cache-transfer-protocol "rdma" \
--rdma-comm-ports ${P_RDMA_COMM_PORTS} \
--pd-comm-port ${P_PD_COMM_PORTS} \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &

echo "--- Health Check Status ---"
wait_for_health ${P_SERVER_PORTS}


# start decode
D_SERVER_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
D_ENGINE_WORKER_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
D_CACHE_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
D_METRICS_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
D_RDMA_COMM_PORTS=$(get_free_ports $NUM_GPUS)
D_PD_COMM_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE)
echo "---------------------------"
echo D_SERVER_PORTS: $D_SERVER_PORTS
echo D_ENGINE_WORKER_QUEUE_PORTS: $D_ENGINE_WORKER_QUEUE_PORTS
echo D_CACHE_QUEUE_PORTS: $D_CACHE_QUEUE_PORTS
echo D_METRICS_PORTS: $D_METRICS_PORTS
echo D_RDMA_COMM_PORTS: $D_RDMA_COMM_PORTS
echo D_PD_COMM_PORTS: $D_PD_COMM_PORTS

export CUDA_VISIBLE_DEVICES="2,3"
export CUDA_VISIBLE_DEVICES="4,5"
export FD_LOG_DIR="log/$LOG_DATE/decode"
rm -rf $FD_LOG_DIR
mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.multi_api_server \
--num-servers ${DATA_PARALLEL_SIZE}\
--ports ${D_SERVER_PORTS} \
--metrics-port ${D_METRICS_PORTS} \
--args --model ${MODEL_NAME} \
--engine-worker-queue-port ${D_ENGINE_WORKER_QUEUE_PORTS} \
--cache-queue-port ${D_CACHE_QUEUE_PORTS} \
--max-model-len 32768 \
--data-parallel-size ${DATA_PARALLEL_SIZE} \
--tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
--splitwise-role "decode" \
--cache-transfer-protocol "rdma" \
--rdma-comm-ports ${D_RDMA_COMM_PORTS} \
--pd-comm-port ${D_PD_COMM_PORTS} \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &

echo "--- Health Check Status ---"
wait_for_health ${D_SERVER_PORTS}


# send request
echo "------ Request Check ------"
sleep 10 # make sure server is registered to router
curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
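The dp2 script now reserves only the API-server ports and leaves the remaining ports to FastDeploy's defaults. A sketch of how the get_free_ports helper from utils.sh is consumed here, assuming it prints a space-separated list of free ports (the helper's body is not part of this diff):

    DATA_PARALLEL_SIZE=2
    P_SERVER_PORTS=$(get_free_ports ${DATA_PARALLEL_SIZE})   # e.g. "52401 52407"
    # multi_api_server then starts one server per listed port:
    #   --num-servers 2 --ports "52401 52407"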
64 changes: 19 additions & 45 deletions examples/splitwise/start_v1_tp1.sh
@@ -9,39 +9,27 @@ set -e
# prepare environment
export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=1
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

SCRIPT_PATH=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu)
echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}"
if [ -z "${KVCACHE_RDMA_NICS}" ]; then
echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh"
exit 1
fi
source ${SCRIPT_DIR}/utils.sh

unset http_proxy && unset https_proxy
rm -rf log_*
source ./utils.sh

P_PORT=52400
D_PORT=52500
ROUTER_PORT=52700
LOG_DATE=$(date +%Y%m%d_%H%M%S)

ports=(
$P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
$D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
$ROUTER_PORT
)
ports=($P_PORT $D_PORT $ROUTER_PORT)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start router
export FD_LOG_DIR="log_router"
mkdir -p ${FD_LOG_DIR}
export FD_LOG_DIR="log/$LOG_DATE/router"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.router.launch \
--port ${ROUTER_PORT} \
@@ -50,43 +38,29 @@ nohup python -m fastdeploy.router.launch \

# start prefill
export CUDA_VISIBLE_DEVICES=0
export FD_LOG_DIR="log_prefill"
mkdir -p ${FD_LOG_DIR}
export FD_LOG_DIR="log/$LOG_DATE/prefill"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port "${P_PORT}" \
--metrics-port "$((P_PORT + 1))" \
--engine-worker-queue-port "$((P_PORT + 2))" \
--cache-queue-port "$((P_PORT + 3))" \
--max-model-len 32768 \
--splitwise-role "prefill" \
--cache-transfer-protocol "rdma" \
--rdma-comm-ports "$((P_PORT + 4))" \
--pd-comm-port "$((P_PORT + 5))" \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &
--model ${MODEL_NAME} \
--port "${P_PORT}" \
--splitwise-role "prefill" \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &

wait_for_health ${P_PORT}

# start decode
export CUDA_VISIBLE_DEVICES=1
export FD_LOG_DIR="log_decode"
mkdir -p ${FD_LOG_DIR}
export FD_LOG_DIR="log/$LOG_DATE/decode"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port "${D_PORT}" \
--metrics-port "$((D_PORT + 2))" \
--engine-worker-queue-port "$((D_PORT + 3))" \
--cache-queue-port "$((D_PORT + 1))" \
--max-model-len 32768 \
--splitwise-role "decode" \
--cache-transfer-protocol "rdma" \
--rdma-comm-ports "$((D_PORT + 4))" \
--pd-comm-port "$((D_PORT + 5))" \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &
--model ${MODEL_NAME} \
--port "${D_PORT}" \
--splitwise-role "decode" \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &

wait_for_health ${D_PORT}

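The trimmed launch commands rely on wait_for_health from utils.sh (shown further below) rather than on hand-assigned port offsets. A rough standalone equivalent of that check, assuming the api_server exposes a /health endpoint:

    P_PORT=52400
    until curl -sf "http://0.0.0.0:${P_PORT}/health" > /dev/null; do
        echo "waiting for server on port ${P_PORT}..."
        sleep 1
    done
    echo "server on port ${P_PORT} is healthy"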
82 changes: 82 additions & 0 deletions examples/splitwise/start_v1_tp2.sh
@@ -0,0 +1,82 @@
#!/bin/bash
set -e

# Test splitwise deployment
# There are two methods for splitwise deployment:
# v0: using splitwise_scheduler or dp_scheduler
# v1: using local_scheduler + router

# prepare environment
export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
export FD_DEBUG=1
export ENABLE_V1_KVCACHE_SCHEDULER=1
export KVCACHE_GDRCOPY_FLUSH_ENABLE=1

SCRIPT_PATH=$(readlink -f "$0")
SCRIPT_DIR=$(dirname "$SCRIPT_PATH")
source ${SCRIPT_DIR}/utils.sh

unset http_proxy && unset https_proxy

P_PORT=52400
D_PORT=52500
ROUTER_PORT=52700
LOG_DATE=$(date +%Y%m%d_%H%M%S)

ports=($P_PORT $D_PORT $ROUTER_PORT)
check_ports "${ports[@]}" || {
echo "❌ Some ports are in use. Please release them."
exit 1
}

# start router
export FD_LOG_DIR="log/$LOG_DATE/router"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.router.launch \
--port ${ROUTER_PORT} \
--splitwise \
2>&1 >${FD_LOG_DIR}/nohup &

# start prefill
export CUDA_VISIBLE_DEVICES=0,1
export FD_LOG_DIR="log/$LOG_DATE/prefill"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port "${P_PORT}" \
--tensor-parallel-size 2 \
--splitwise-role "prefill" \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &

wait_for_health ${P_PORT}

# start decode
export CUDA_VISIBLE_DEVICES=2,3
export FD_LOG_DIR="log/$LOG_DATE/decode"
rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR}

nohup python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_NAME} \
--port "${D_PORT}" \
--tensor-parallel-size 2 \
--splitwise-role "decode" \
--router "0.0.0.0:${ROUTER_PORT}" \
2>&1 >${FD_LOG_DIR}/nohup &

wait_for_health ${D_PORT}

# send request
sleep 10 # make sure server is registered to router
echo "send request..."
curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "hello"}
],
"max_tokens": 100,
"stream": false
}'
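Because the endpoint is OpenAI-compatible, the same smoke test can be repeated with streaming enabled to watch tokens arrive from the decode instance; only the stream field changes:

    curl -N -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
      -H "Content-Type: application/json" \
      -d '{
        "messages": [
          {"role": "user", "content": "hello"}
        ],
        "max_tokens": 100,
        "stream": true
      }'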
6 changes: 4 additions & 2 deletions examples/splitwise/utils.sh
@@ -2,7 +2,7 @@

is_port_free() {
local port=$1
if ss -ltn | awk '{print $4}' | grep -q ":${port}$"; then
if ss -ltun | awk '{print $4}' | grep -q ":${port}$"; then
return 1 # Port is occupied
fi
return 0 # Port is free
@@ -28,6 +28,7 @@ wait_for_health() {
local NC='\033[0m' # No Color
local start_time=$(date +%s)

echo "-------- WAIT FOR HEALTH --------"
while true; do
local all_ready=true
for port in "${server_ports[@]}"; do
@@ -44,11 +45,12 @@
echo "All services are ready! [$((cur_time-start_time))s]"
break
else
echo "Waiting for services... [$((cur_time-start_time))s]"
echo "Services not ready.. [$((cur_time-start_time))s]"
printf "\033[%dA" "$total_lines" # roll back cursor
sleep 1
fi
done
echo "---------------------------------"
}

get_free_ports() {
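check_ports, which the start scripts call before launching anything, is not part of this diff. A minimal sketch of how such a helper could be built on top of is_port_free (an assumption for illustration, not the repository's implementation):

    check_ports() {
        local failed=0
        for port in "$@"; do
            if ! is_port_free "${port}"; then
                echo "port ${port} is already in use"
                failed=1
            fi
        done
        return ${failed}    # non-zero if any port is occupied
    }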
4 changes: 2 additions & 2 deletions fastdeploy/cache_manager/cache_messager.py
@@ -62,7 +62,7 @@ def parse_args():
parser.add_argument("--value_cache_shape", type=str, default="", help="value cache shape")
parser.add_argument("--rdma_port", type=str, default="", help="rmda port")
parser.add_argument("--mp_num", type=int, default=1, help="number of model parallel, i.e. tp_size, tp_num")
parser.add_argument("--engine_pid", type=str, default=None, help="engine pid")
parser.add_argument("--ipc_suffix", type=str, default=None, help="ipc suffix")
parser.add_argument(
"--protocol",
type=str,
@@ -945,7 +945,7 @@ def main():
name="cache_ready_signal",
array=cache_ready_signal_data,
dtype=np.int32,
suffix=args.engine_pid,
suffix=args.ipc_suffix,
create=False,
)
cache_ready_signal.value[rank] = 1
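With the rename, callers identify the shared cache_ready_signal region by an explicit IPC suffix instead of the engine PID. A hypothetical invocation from the shell (the module path and the omission of other required arguments are assumptions; only the flags shown in this diff are confirmed):

    python -m fastdeploy.cache_manager.cache_messager \
        --protocol rdma \
        --mp_num 1 \
        --ipc_suffix 20250101    # any suffix shared with the engine that created the signal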