From 209bce3c2fca45e192e531dd25941bb0c5019efa Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Sat, 6 Dec 2025 13:11:39 +0800 Subject: [PATCH 01/17] [feat] simplify configuration for pd-disaggregated deployment, and refactor post-init and usage for all ports --- examples/splitwise/start_v1_tp1.sh | 28 +-- examples/splitwise/utils.sh | 6 +- fastdeploy/cache_manager/cache_messager.py | 4 +- .../cache_manager/cache_transfer_manager.py | 12 +- .../cache_manager/prefix_cache_manager.py | 16 +- .../transfer_factory/get_rdma_nics.sh | 225 ++++++++++++++++++ .../transfer_factory/rdma_cache_transfer.py | 49 +++- fastdeploy/config.py | 77 +++--- fastdeploy/engine/args_utils.py | 90 ++++--- fastdeploy/engine/common_engine.py | 45 +--- fastdeploy/engine/engine.py | 10 +- fastdeploy/engine/expert_service.py | 11 - fastdeploy/splitwise/splitwise_connector.py | 10 +- fastdeploy/utils.py | 67 +++++- fastdeploy/worker/worker_process.py | 10 +- 15 files changed, 487 insertions(+), 173 deletions(-) create mode 100644 fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh diff --git a/examples/splitwise/start_v1_tp1.sh b/examples/splitwise/start_v1_tp1.sh index 5b77533fd46..7f5a373303b 100644 --- a/examples/splitwise/start_v1_tp1.sh +++ b/examples/splitwise/start_v1_tp1.sh @@ -14,26 +14,16 @@ export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 SCRIPT_PATH=$(readlink -f "$0") SCRIPT_DIR=$(dirname "$SCRIPT_PATH") -export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu) -echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}" -if [ -z "${KVCACHE_RDMA_NICS}" ]; then - echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh" - exit 1 -fi +source ${SCRIPT_DIR}/utils.sh unset http_proxy && unset https_proxy rm -rf log_* -source ./utils.sh P_PORT=52400 D_PORT=52500 ROUTER_PORT=52700 -ports=( - $P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5)) - $D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5)) - $ROUTER_PORT -) +ports=($P_PORT $D_PORT $ROUTER_PORT) check_ports "${ports[@]}" || { echo "❌ Some ports are in use. Please release them." 
exit 1 @@ -56,14 +46,7 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port "${P_PORT}" \ - --metrics-port "$((P_PORT + 1))" \ - --engine-worker-queue-port "$((P_PORT + 2))" \ - --cache-queue-port "$((P_PORT + 3))" \ - --max-model-len 32768 \ --splitwise-role "prefill" \ - --cache-transfer-protocol "rdma" \ - --rdma-comm-ports "$((P_PORT + 4))" \ - --pd-comm-port "$((P_PORT + 5))" \ --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & @@ -77,14 +60,7 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ --port "${D_PORT}" \ - --metrics-port "$((D_PORT + 2))" \ - --engine-worker-queue-port "$((D_PORT + 3))" \ - --cache-queue-port "$((D_PORT + 1))" \ - --max-model-len 32768 \ --splitwise-role "decode" \ - --cache-transfer-protocol "rdma" \ - --rdma-comm-ports "$((D_PORT + 4))" \ - --pd-comm-port "$((D_PORT + 5))" \ --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & diff --git a/examples/splitwise/utils.sh b/examples/splitwise/utils.sh index af6d741032e..31ef8fc5b61 100644 --- a/examples/splitwise/utils.sh +++ b/examples/splitwise/utils.sh @@ -2,7 +2,7 @@ is_port_free() { local port=$1 - if ss -ltn | awk '{print $4}' | grep -q ":${port}$"; then + if ss -ltun | awk '{print $4}' | grep -q ":${port}$"; then return 1 # Port is occupied fi return 0 # Port is free @@ -28,6 +28,7 @@ wait_for_health() { local NC='\033[0m' # No Color local start_time=$(date +%s) + echo "-------- WAIT FOR HEALTH --------" while true; do local all_ready=true for port in "${server_ports[@]}"; do @@ -44,11 +45,12 @@ wait_for_health() { echo "All services are ready! [$((cur_time-start_time))s]" break else - echo "Waiting for services... [$((cur_time-start_time))s]" + echo "Services not ready.. [$((cur_time-start_time))s]" printf "\033[%dA" "$total_lines" # roll back cursor sleep 1 fi done + echo "---------------------------------" } get_free_ports() { diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index 25a3b50e6e9..e743a0d47b0 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -62,7 +62,7 @@ def parse_args(): parser.add_argument("--value_cache_shape", type=str, default="", help="value cache shape") parser.add_argument("--rdma_port", type=str, default="", help="rmda port") parser.add_argument("--mp_num", type=int, default=1, help="number of model parallel, i.e. 
tp_size, tp_num") - parser.add_argument("--engine_pid", type=str, default=None, help="engine pid") + parser.add_argument("--ipc_suffix", type=str, default=None, help="ipc suffix") parser.add_argument( "--protocol", type=str, @@ -917,7 +917,7 @@ def main(): name="cache_ready_signal", array=cache_ready_signal_data, dtype=np.int32, - suffix=args.engine_pid, + suffix=args.ipc_suffix, create=False, ) cache_ready_signal.value[rank] = 1 diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py index b2b8218c805..7f6f2b7b9e0 100644 --- a/fastdeploy/cache_manager/cache_transfer_manager.py +++ b/fastdeploy/cache_manager/cache_transfer_manager.py @@ -78,7 +78,7 @@ def parse_args(): help="engine worker queue port", ) parser.add_argument("--num_cpu_blocks", type=int, default=4, help="cpu cache block number") - parser.add_argument("--engine_pid", type=str, default=None, help="engine pid") + parser.add_argument("--ipc_suffix", type=str, default=None, help="engine pid") parser.add_argument( "--protocol", type=str, @@ -132,7 +132,7 @@ def __init__(self, args): self.n_ranks = args.mp_num self.rank = rank self.device = device - self.engine_pid = args.engine_pid + self.ipc_suffix = args.ipc_suffix self.cache_dtype = args.cache_dtype address = (args.pod_ip, args.cache_queue_port) @@ -149,7 +149,7 @@ def __init__(self, args): name="cache_ready_signal", array=cache_ready_signal_data, dtype=np.int32, - suffix=self.engine_pid, + suffix=self.ipc_suffix, create=False, ) swap_space_ready_data = np.zeros(shape=[args.mp_num], dtype=np.int32) @@ -157,7 +157,7 @@ def __init__(self, args): name="swap_space_ready_signal", array=swap_space_ready_data, dtype=np.int32, - suffix=self.engine_pid, + suffix=self.ipc_suffix, create=False, ) @@ -172,7 +172,7 @@ def __init__(self, args): name="cache_task_broadcast_signal", array=cache_task_broadcast_data, dtype=np.int32, - suffix=args.engine_pid, + suffix=args.ipc_suffix, create=False, ) @@ -645,7 +645,7 @@ def clear_or_update_caches(self, args): name="kv_cache_status", array=kv_cache_status, dtype=np.int32, - suffix=self.engine_pid, + suffix=self.ipc_suffix, create=False, ) while True: diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index a3c610965a5..af6a6eef595 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -167,7 +167,7 @@ def launch_cache_manager( device_ids, pod_ip, engine_worker_queue_port, - pid_suffix, + ipc_suffix, create_cache_tensor, ): """ @@ -210,7 +210,7 @@ def launch_cache_manager( val_cache_shape, pod_ip, engine_worker_queue_port, - pid_suffix, + ipc_suffix, ) if cache_messager_processes is None: raise RuntimeError("Launch cache messager failed") @@ -274,10 +274,10 @@ def launch_cache_manager( + f" --pod_ip {pod_ip}" + f" --engine_worker_queue_port {engine_worker_queue_port}" + f" --num_cpu_blocks {cache_config.num_cpu_blocks}" - + f" --engine_pid {pid_suffix}" + + f" --ipc_suffix {ipc_suffix}" + f" --protocol {cache_config.cache_transfer_protocol}" + f" --local_data_parallel_id {self.local_data_parallel_id}" - + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --rdma_port {cache_config.rdma_comm_ports[0] if cache_config.rdma_comm_ports is not None else '0'}" + f" --speculative_config '{self.speculative_config.to_json_string()}'" + (" --create_cache_tensor" if create_cache_tensor else "") + f" 
>{log_dir}/launch_cache_transfer_manager_tprank{i}.log 2>&1" @@ -320,7 +320,7 @@ def launch_cache_messager( value_cache_shape, pod_ip, engine_worker_queue_port, - pid_suffix, + ipc_suffix, ): """ launch_cache_messager function used to initialize the cache messager. @@ -333,7 +333,7 @@ def launch_cache_messager( name="cache_ready_signal", array=cache_ready_signal_data, dtype=np.int32, - suffix=pid_suffix, + suffix=ipc_suffix, create=False, ) @@ -369,8 +369,8 @@ def launch_cache_messager( + f" --engine_worker_queue_port {engine_worker_queue_port}" + f" --protocol {cache_config.cache_transfer_protocol}" + f" --local_data_parallel_id {self.local_data_parallel_id}" - + f" --engine_pid {pid_suffix}" - + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --ipc_suffix {ipc_suffix}" + + f" --rdma_port {cache_config.rdma_comm_ports[0] if cache_config.rdma_comm_ports is not None else '0'}" + f" --speculative_config '{self.speculative_config.to_json_string()}'" + f" >{log_dir}/launch_cache_messager_tprank{i}.log 2>&1" ) diff --git a/fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh b/fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh new file mode 100644 index 00000000000..4fc07a98c9a --- /dev/null +++ b/fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh @@ -0,0 +1,225 @@ +#!/bin/bash +Cur_Dir=$(cd `dirname $0`; pwd) +NICNAME_TYPE=xgbe # 默认检测类型 +type=$1 + +if [ "$ENABLE_EP_DP" == "1" ]; then + gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}.txt" +else + gpu_root_port_filename="${Cur_Dir}/gpu_rootport.txt" +fi + +function __NEW_GPU_ROOTPORT_FILE__() { + touch ${gpu_root_port_filename} 2>/dev/null + echo "" > ${gpu_root_port_filename} 2>/dev/null + for gpu_bus in $(lspci 2>/dev/null | grep -iE "Communication controller: | controller: NVIDIA" | awk '{print $1}') + do + readlink "/sys/bus/pci/devices/0000:${gpu_bus}" 2>/dev/null | awk -F [/] '{print $6}' >> ${gpu_root_port_filename} + done +} + +function __RM_GPU_ROOTPORT_FILE__() { + rm -rf ${gpu_root_port_filename} 2>/dev/null +} + +function __JUDGE_NIC_TYPE__() { + XGBE_NUM=$(ip a 2>/dev/null | grep -c ": ${NICNAME_TYPE}") + gpu_first=true + xpu_first=true + cpu_first=true + + for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ )) + do + [ ! 
-d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue + + PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}') + [ -z "$PCI_ADDRESS" ] && continue + NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}') + + NIC_TYPE="CPU_NIC" + grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && NIC_TYPE="GPU_NIC" + + if [[ "$type" == "gpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then + ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') + if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then + if $gpu_first; then + printf "KVCACHE_RDMA_NICS=%s" "$ibdev" + gpu_first=false + else + printf ",%s" "$ibdev" + fi + fi + fi + + if [[ "$type" == "xpu" && "$NIC_TYPE" == "GPU_NIC" ]]; then + ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') + if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP"; then + if $xpu_first; then + printf "KVCACHE_RDMA_NICS=%s,%s" "$ibdev" "$ibdev" + xpu_first=false + else + printf ",%s,%s" "$ibdev" "$ibdev" + fi + fi + fi + + if [[ "$type" == "cpu" ]]; then + for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ )) + do + [ ! -d "/sys/class/net/${NICNAME_TYPE}${xgbe_no}" ] && continue + + PCI_ADDRESS=$(ethtool -i "${NICNAME_TYPE}${xgbe_no}" 2>/dev/null | awk -F '0000:' '/bus-info/{print $2}') + [ -z "$PCI_ADDRESS" ] && continue + + NIC_ROOTPORT=$(readlink "/sys/bus/pci/devices/0000:${PCI_ADDRESS}" 2>/dev/null | awk -F '/' '{print $6}') + grep -qxF "$NIC_ROOTPORT" ${gpu_root_port_filename} 2>/dev/null && continue + + if ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \ + ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet"; then + printf "KV_CACHE_SOCKET_IFNAME=%s\n" "${NICNAME_TYPE}${xgbe_no}" + return 0 + fi + done + echo "ERROR: No active CPU NIC with IP found!" >&2 + return 1 + fi + + if [[ "$type" == "cpu_ib" && "$NIC_TYPE" == "CPU_NIC" ]]; then + ibdev=$(ibdev2netdev 2>/dev/null | awk -v nic="${NICNAME_TYPE}${xgbe_no}" '$5 == nic {print $1}') + if [ -n "$ibdev" ] && ip link show "${NICNAME_TYPE}${xgbe_no}" | grep -q "state UP" && \ + ip a show "${NICNAME_TYPE}${xgbe_no}" | grep -q "inet "; then + if $cpu_ib_first; then + printf "KVCACHE_RDMA_NICS=%s" "$ibdev" + cpu_ib_first=false + else + printf ",%s" "$ibdev" + fi + fi + fi + + done + + case "$type" in + gpu) ! $gpu_first && printf "\n" ;; + xpu) ! $xpu_first && printf "\n" ;; + cpu) ! $cpu_first && printf "\n" ;; + cpu_ib) ! $cpu_ib_first && printf "\n" ;; + esac +} + +function get_vxpu_nics() { + local topo_output=$(xpu-smi topo -m) + local xpu_info=$(echo "$topo_output" | grep -E '^XPU[0-9]+') + + local nic_mapping=() + while IFS= read -r line; do + if [[ $line =~ NIC([0-9]+):\ +(mlx[0-9_]+) ]]; then + local nic_idx=${BASH_REMATCH[1]} + local nic_name=${BASH_REMATCH[2]} + nic_mapping[$nic_idx]=$nic_name + fi + done < <(echo "$topo_output" | grep -E '^\s*NIC[0-9]+:') + + local nic_count=${#nic_mapping[@]} + + declare -A priority_map=([PIX]=2 [NODE]=1 [SYS]=0) + local optimal_nics=() + + while IFS= read -r line; do + local fields=($line) + local nic_start_index=5 + local max_nics=$(( ${#fields[@]} - nic_start_index )) + local actual_nic_count=$(( max_nics < nic_count ? 
max_nics : nic_count )) + + local best_priority=-1 + local best_nic="" + + for ((nic_idx=0; nic_idx best_priority )); then + best_priority=$current_priority + best_nic="${nic_mapping[$nic_idx]}" + fi + done + + if [[ -n "$best_nic" ]]; then + optimal_nics+=("$best_nic") + fi + done <<< "$xpu_info" + + local IFS=, + export KVCACHE_RDMA_NICS="${optimal_nics[*]}" + echo "KVCACHE_RDMA_NICS=${optimal_nics[*]}" +} + +function get_vcpu_nics() { + ip -o addr show | awk '$3 == "inet" && $4 ~ /^10\./ {print "KV_CACHE_SOCKET_IFNAME="$2; exit}' +} + +function __main__() { + if [[ "$type" == "vxpu" ]]; then + get_vxpu_nics + return 0 + fi + if [[ "$type" == "vcpu" ]]; then + get_vcpu_nics + return 0 + fi + + # 处理 bond 情况 + if [[ "$type" == "cpu" ]]; then + for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do + bond_if=$(basename "$bond") + if ip link show "$bond_if" | grep -q "state UP" && \ + ip a show "$bond_if" | grep -q "inet "; then + printf "KV_CACHE_SOCKET_IFNAME=%s\n" "$bond_if" + return 0 + fi + done + fi + + if [[ "$type" == "cpu_ib" ]]; then + first=true + for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do + bond_if=$(basename "$bond") + __NEW_GPU_ROOTPORT_FILE__ + + ibdev=$(ibdev2netdev 2>/dev/null | grep -w "$bond_if" | awk '{print $1}') + if [ -n "$ibdev" ] && ip link show "$bond_if" | grep -q "state UP" && \ + ip a show "$bond_if" | grep -q "inet "; then + if $first; then + printf "KVCACHE_RDMA_NICS=%s" "$ibdev" + first=false + else + printf ",%s" "$ibdev" + fi + fi + + bondib=$(show_gids 2>/dev/null | grep -w "$bond_if" | awk '{print $1}' | grep "mlx.*bond" | head -1) + if [ -n "$bondib" ] && ip link show "$bond_if" | grep -q "state UP" && \ + ip a show "$bond_if" | grep -q "inet " && $first; then + printf "KVCACHE_RDMA_NICS=%s" "$bondib" + first=false + fi + + __RM_GPU_ROOTPORT_FILE__ + done + + ! $first && printf "\n" + [ ! $first ] && return 0 + fi + + local nic_types=("eth" "ib" "xgbe") + for nt in "${nic_types[@]}"; do + if ip a | grep -iq "$nt"; then + __NEW_GPU_ROOTPORT_FILE__ + NICNAME_TYPE=$nt + __JUDGE_NIC_TYPE__ + __RM_GPU_ROOTPORT_FILE__ + fi + done +} + +__main__ diff --git a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py index f90a5d23234..2de855c86bd 100644 --- a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py +++ b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py @@ -14,6 +14,8 @@ # limitations under the License. 
""" +import traceback + from fastdeploy.utils import get_logger logger = get_logger("cache_messager", "cache_messager.log") @@ -37,13 +39,56 @@ def __init__( prefill_tp_size, prefill_tp_idx, ): + try: + import os + import subprocess + + from fastdeploy.platforms import current_platform + + if os.getenv("KVCACHE_GDRCOPY_FLUSH_ENABLE", "") == "" and current_platform.is_cuda(): + result = subprocess.run( + ["nvidia-smi", "-i", "0", "--query-gpu=compute_cap", "--format=csv,noheader"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError(f"Failed to get compute capability via nvidia-smi: {result.stderr.strip()}") + + major, minor = result.stdout.strip().split(".") + if major == "8": # for ampere arch + os.environ["KVCACHE_GDRCOPY_FLUSH_ENABLE"] = "1" + logger.info("Setting environment variable: export KVCACHE_GDRCOPY_FLUSH_ENABLE=1") + + if os.getenv("KVCACHE_RDMA_NICS", "") == "": + get_rdma_nics = os.path.join(os.path.dirname(__file__), "get_rdma_nics.sh") + nic_type = current_platform.device_name + result = subprocess.run( + ["bash", get_rdma_nics, nic_type], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + if result.returncode != 0: + raise RuntimeError(f"Failed to execute script `get_rdma_nics.sh`: {result.stderr.strip()}") + + env_name, env_value = result.stdout.strip().split("=") + assert env_name == "KVCACHE_RDMA_NICS" + os.environ[env_name] = env_value + logger.info(f"Setting environment variable: export {env_name}={env_value}") + + except Exception as e: + raise RuntimeError(f"Failed to initialize RDMA environment! {e} {traceback.format_exc()}") + try: import rdma_comm except: raise RuntimeError( - "The installation of the RDMA library failed." - "Confirm whether your network card supports RDMA transmission." + "The installation of the RDMA library failed. Confirm whether your network card supports RDMA transmission." 
) + self.messager = rdma_comm.RDMACommunicator( splitwise_role, gpu_id, diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 58e7c4f3144..ddc75d6ad4e 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -33,7 +33,13 @@ from fastdeploy.platforms import current_platform from fastdeploy.scheduler import SchedulerConfig from fastdeploy.transformer_utils.config import get_pooling_config -from fastdeploy.utils import ceil_div, check_unified_ckpt, get_host_ip, get_logger +from fastdeploy.utils import ( + ceil_div, + check_unified_ckpt, + get_host_ip, + get_logger, + parse_ports, +) logger = get_logger("config", "config.log") @@ -553,7 +559,8 @@ def __init__( self.local_data_parallel_id = 0 # Engine worker queue port - self.engine_worker_queue_port: str = "9923" + self.engine_worker_queue_port: Union[int, str, list] = None + self.local_engine_worker_queue_port: Optional[int] = None # cuda visible devices self.device_ids: str = "0" # First token id @@ -573,11 +580,9 @@ def __init__( for key, value in args.items(): if hasattr(self, key): setattr(self, key, value) - if isinstance(self.engine_worker_queue_port, str): - self.engine_worker_queue_port = [int(port) for port in self.engine_worker_queue_port.split(",")] - logger.info(f"engine_worker_queue_port: {self.engine_worker_queue_port}") - elif isinstance(self.engine_worker_queue_port, int): - self.engine_worker_queue_port = [self.engine_worker_queue_port] + + self.engine_worker_queue_port = parse_ports(self.engine_worker_queue_port) + # currently, the expert parallel size is equal data parallel size if self.enable_expert_parallel: self.expert_parallel_size = self.data_parallel_size * self.tensor_parallel_size @@ -1275,11 +1280,9 @@ def __init__(self, args): if hasattr(self, key): setattr(self, key, value) - if self.rdma_comm_ports is not None and isinstance(self.rdma_comm_ports, str): - self.rdma_comm_ports = self.rdma_comm_ports.split(",") - - if self.pd_comm_port is not None and isinstance(self.pd_comm_port, str): - self.pd_comm_port = [int(port) for port in self.pd_comm_port.split(",")] + self.cache_queue_port = parse_ports(self.cache_queue_port) + self.rdma_comm_ports = parse_ports(self.rdma_comm_ports) + self.pd_comm_port = parse_ports(self.pd_comm_port) if self.swap_space is None: self.enable_hierarchical_cache = False @@ -1631,7 +1634,6 @@ def postprocess(self): """ calculate some parameters """ - self.local_device_ids = self.parallel_config.device_ids.split(",")[: self.parallel_config.tensor_parallel_size] if self.parallel_config.tensor_parallel_size <= self.worker_num_per_node or self.node_rank == 0: self.is_master = True @@ -1735,6 +1737,35 @@ def postprocess(self): else: raise NotImplementedError + # get devices and ports for current dp + self.local_device_ids = self.parallel_config.device_ids.split(",")[ + self.parallel_config.local_data_parallel_id + * self.parallel_config.tensor_parallel_size : (self.parallel_config.local_data_parallel_id + 1) + * self.parallel_config.tensor_parallel_size + ] + self.parallel_config.local_engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[ + self.parallel_config.local_data_parallel_id + ] + self.cache_config.cache_queue_port = ( + self.cache_config.cache_queue_port[self.parallel_config.local_data_parallel_id] + if self.cache_config.cache_queue_port + else None + ) + self.cache_config.pd_comm_port = ( + self.cache_config.pd_comm_port[self.parallel_config.local_data_parallel_id] + if self.cache_config.pd_comm_port + else None + ) + 
self.cache_config.rdma_comm_ports = ( + self.cache_config.rdma_comm_ports[ + self.parallel_config.local_data_parallel_id + * self.parallel_config.tensor_parallel_size : (self.parallel_config.local_data_parallel_id + 1) + * self.parallel_config.tensor_parallel_size + ] + if self.cache_config.rdma_comm_ports + else None + ) + def check(self): """ check the legality of config @@ -1883,18 +1914,6 @@ def init_cache_info(self): elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router: self.splitwise_version = "v1" - if isinstance(self.parallel_config.engine_worker_queue_port, (int, str)): - engine_worker_queue_port = self.parallel_config.engine_worker_queue_port - else: - engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[ - self.parallel_config.local_data_parallel_id - ] - connector_port = ( - self.cache_config.pd_comm_port[self.parallel_config.local_data_parallel_id] - if self.cache_config.pd_comm_port - else None - ) - self.disaggregate_info = {} if self.scheduler_config.splitwise_role != "mixed": self.disaggregate_info["role"] = self.scheduler_config.splitwise_role @@ -1906,13 +1925,13 @@ def init_cache_info(self): if protocol == "ipc": self.disaggregate_info["cache_info"][protocol] = { "ip": self.host_ip, - "port": engine_worker_queue_port, + "port": self.parallel_config.local_engine_worker_queue_port, "device_ids": self.local_device_ids, } elif protocol == "rdma": self.disaggregate_info["cache_info"][protocol] = { "ip": self.host_ip, - "port": connector_port, + "port": self.cache_config.pd_comm_port, "rdma_port": self.cache_config.rdma_comm_ports, } logger.info(f"disaggregate_info: {self.disaggregate_info}") @@ -1923,9 +1942,9 @@ def init_cache_info(self): "role": self.scheduler_config.splitwise_role, "host_ip": self.host_ip, "port": self.router_config.api_server_port, - "connector_port": connector_port, + "connector_port": self.cache_config.pd_comm_port, "rdma_ports": self.cache_config.rdma_comm_ports, - "engine_worker_queue_port": engine_worker_queue_port, + "engine_worker_queue_port": self.parallel_config.local_engine_worker_queue_port, "device_ids": self.local_device_ids, "transfer_protocol": self.cache_config.cache_transfer_protocol.split(","), "tp_size": self.parallel_config.tensor_parallel_size, diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index f8cf662ef73..c759b8feef5 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -46,7 +46,9 @@ DeprecatedOptionWarning, FlexibleArgumentParser, console_logger, + find_free_ports, is_port_available, + parse_ports, parse_quantization, ) @@ -223,7 +225,7 @@ class EngineArgs: The amount of CPU memory to offload to. """ - cache_queue_port: str = "0" + cache_queue_port: Optional[Union[int, str, list]] = None """ Port for cache queue. """ @@ -265,7 +267,7 @@ class EngineArgs: # This optimization is enabled by default, and can be disabled by using this flag. """ - engine_worker_queue_port: str = "0" + engine_worker_queue_port: Optional[Union[int, str, list]] = None """ Port for worker queue communication. """ @@ -300,17 +302,17 @@ class EngineArgs: Chunk size of moe input. """ - cache_transfer_protocol: str = "ipc" + cache_transfer_protocol: str = "ipc,rdma" """ Protocol to use for cache transfer. """ - pd_comm_port: Optional[List[int]] = None + pd_comm_port: Optional[Union[int, str, list]] = None """ Port for splitwise communication. 
""" - rdma_comm_ports: Optional[List[int]] = None + rdma_comm_ports: Optional[Union[int, str, list]] = None """ Ports for rdma communication. """ @@ -504,8 +506,6 @@ def __post_init__(self): self.enable_prefix_caching = False if not current_platform.is_cuda() and not current_platform.is_xpu() and not current_platform.is_intel_hpu(): self.enable_prefix_caching = False - # if self.dynamic_load_weight: - # self.enable_prefix_caching = False if self.enable_logprob: if not current_platform.is_cuda() and not current_platform.is_xpu(): raise NotImplementedError("Only CUDA and XPU platforms support logprob.") @@ -526,27 +526,6 @@ def __post_init__(self): f"scheduler, please provide --router argument." ) - if "rdma" in self.cache_transfer_protocol: - if self.rdma_comm_ports is None: - raise ValueError( - "Please set --rdma_comm_ports argument when using " "rdma cache transfer protocol." - ) - num_nodes = len(self.ips) if self.ips else 1 - if self.data_parallel_size % num_nodes != 0: - raise ValueError( - f"data_parallel_size ({self.data_parallel_size}) must be divisible by " - f"num_nodes ({num_nodes})." - ) - dp_per_node = self.data_parallel_size // num_nodes - expected_ports = self.tensor_parallel_size * dp_per_node - if len(self.rdma_comm_ports) != expected_ports: - raise ValueError( - f"The number of rdma_comm_ports must equal " - f"tensor_parallel_size * (data_parallel_size / num_nodes) = " - f"{self.tensor_parallel_size} * ({self.data_parallel_size} / {num_nodes}) " - f"= {expected_ports}, but got {len(self.rdma_comm_ports)}." - ) - if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_maca()): envs.ENABLE_V1_KVCACHE_SCHEDULER = 0 @@ -555,6 +534,52 @@ def __post_init__(self): self.enable_prefix_caching = False self.max_encoder_cache = 0 + self.post_init_all_ports() + + def post_init_all_ports(self): + + def post_init_ports(name: str, ports: list, num_total_ports: int): + num_expected_ports = num_total_ports + if envs.FD_ENABLE_MULTI_API_SERVER: + num_expected_ports //= self.data_parallel_size + if ports is None: + ports = find_free_ports(num_ports=num_expected_ports) + console_logger.info(f"Parameter `{name}` is not specified, found available ports for use: {ports}") + else: + assert ( + len(ports) == num_total_ports + ), f"Parameter `{name}` should have {num_total_ports} ports, got {len(ports)}." + ports = parse_ports(ports) + for port in ports: + assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." + return ports + + num_nodes = len(self.ips) if self.ips else 1 + if self.data_parallel_size % num_nodes != 0: + raise ValueError( + f"data_parallel_size ({self.data_parallel_size}) must be divisible by num_nodes ({num_nodes})." 
+ ) + self.engine_worker_queue_port = post_init_ports( + "engine_worker_queue_port", + self.engine_worker_queue_port, + self.data_parallel_size // num_nodes, + ) + self.cache_queue_port = post_init_ports( + "cache_queue_port", + self.cache_queue_port, + self.data_parallel_size // num_nodes, + ) + self.rdma_comm_ports = post_init_ports( + "rdma_comm_ports", + self.rdma_comm_ports, + self.tensor_parallel_size * self.data_parallel_size // num_nodes, + ) + self.pd_comm_port = post_init_ports( + "pd_comm_port", + self.pd_comm_port, + self.data_parallel_size // num_nodes, + ) + @staticmethod def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: """ @@ -1266,11 +1291,6 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig: else: self.max_num_batched_tokens = self.max_model_len - if isinstance(self.engine_worker_queue_port, int): - self.engine_worker_queue_port = str(self.engine_worker_queue_port) - if isinstance(self.engine_worker_queue_port, str): - self.engine_worker_queue_port = self.engine_worker_queue_port.split(",") - all_dict = asdict(self) all_dict["model_cfg"] = model_cfg cache_cfg = CacheConfig(all_dict) @@ -1285,10 +1305,6 @@ def create_engine_config(self, port_availability_check=True) -> FDConfig: early_stop_cfg = self.create_early_stop_config() early_stop_cfg.update_enable_early_stop(self.enable_early_stop) structured_outputs_config: StructuredOutputsConfig = StructuredOutputsConfig(args=all_dict) - if port_availability_check: - assert is_port_available( - "0.0.0.0", int(self.engine_worker_queue_port[parallel_cfg.local_data_parallel_id]) - ), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use." return FDConfig( model_config=model_cfg, diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index cc02363c411..019a44805a6 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -74,13 +74,6 @@ def __init__(self, cfg, start_queue=True): cfg (Config): Config object containing all the configuration parameters. 
""" self.cfg = cfg - if cfg.scheduler_config.splitwise_role != "mixed" or cfg.cache_config.enable_prefix_caching: - if isinstance(self.cfg.cache_config.cache_queue_port, str): - self.cfg.cache_config.cache_queue_port = self.cfg.cache_config.cache_queue_port.split(",") - if isinstance(self.cfg.cache_config.cache_queue_port, list): - self.cfg.cache_config.cache_queue_port = int( - self.cfg.cache_config.cache_queue_port[self.cfg.parallel_config.local_data_parallel_id] - ) if self.cfg.parallel_config.data_parallel_size > 1: self.llm_logger = get_logger( @@ -113,9 +106,7 @@ def __init__(self, cfg, start_queue=True): self.start_worker_queue_service(start_queue) - os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.parallel_config.engine_worker_queue_port[ - self.cfg.parallel_config.local_data_parallel_id - ] + os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.cfg.parallel_config.local_engine_worker_queue_port) self.split_connector = SplitwiseConnector(cfg, self.engine_worker_queue, self.resource_manager) self.token_processor = TokenProcessor( @@ -144,9 +135,7 @@ def __init__(self, cfg, start_queue=True): self._init_worker_monitor_signals() if self.cfg.eplb_config.enable_eplb: - current_suffix = int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] - ) + current_suffix = self.cfg.parallel_config.local_engine_worker_queue_port init_eplb_signals(cfg, current_suffix) self._finalizer = weakref.finalize(self, self._exit_sub_services) @@ -178,9 +167,7 @@ def create_data_processor(self): self.data_processor = self.input_processor.create_processor() def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进程感知是否有新Task需要处理 - current_suffix = int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] - ) + current_suffix = self.cfg.parallel_config.local_engine_worker_queue_port self.llm_logger.info(f"current_suffix: {current_suffix}") exist_task_signal_data = np.zeros([1], dtype=np.int32) self.exist_task_signal = IPCSignal( @@ -272,16 +259,10 @@ def start_worker_queue_service(self, start_queue): """ start queue service for engine worker communication """ - if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM: - address = ( - self.cfg.master_ip, - int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] - ), - ) + address = (self.cfg.master_ip, self.cfg.parallel_config.local_engine_worker_queue_port) else: - address = f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]}.sock" + address = f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.local_engine_worker_queue_port}.sock" if start_queue and (self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"): self.llm_logger.info(f"Starting engine worker queue server service at {address}") @@ -293,16 +274,12 @@ def start_worker_queue_service(self, start_queue): ) # Dynamically updates the port value if an anonymous port is used if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM: - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] = ( - str(self.engine_worker_queue_server.get_server_port()) + self.cfg.parallel_config.local_engine_worker_queue_port = ( + self.engine_worker_queue_server.get_server_port() ) address = ( self.cfg.master_ip, - int( - self.cfg.parallel_config.engine_worker_queue_port[ - self.cfg.parallel_config.local_data_parallel_id - ] - ), + 
self.cfg.parallel_config.local_engine_worker_queue_port, ) if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed": @@ -1214,10 +1191,8 @@ def start_cache_service(self, device_ids, ipc_signal_suffix): tensor_parallel_size=self.cfg.parallel_config.tensor_parallel_size, device_ids=device_ids, pod_ip=self.cfg.master_ip, - engine_worker_queue_port=int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] - ), - pid_suffix=ipc_signal_suffix, + engine_worker_queue_port=self.cfg.parallel_config.local_engine_worker_queue_port, + ipc_suffix=ipc_signal_suffix, create_cache_tensor=False, ) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 4a493843df7..351cda8b4da 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -522,7 +522,7 @@ def _start_worker_service(self): image_patch_id = self.data_processor.tokenizer.get_vocab().get("<|IMAGE_PLACEHOLDER|>", -1) line_break_id = self.data_processor.tokenizer.get_vocab().get("\n", -1) - ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port) + ports = ",".join(map(str, self.cfg.parallel_config.engine_worker_queue_port)) ips = None if self.cfg.ips is not None: ips = ",".join(self.cfg.ips) @@ -730,7 +730,7 @@ def launch_components(self): ) if not envs.FD_ENABLE_MULTI_API_SERVER: - if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1: + if self.cfg.parallel_config.data_parallel_size > 1: self.launched_expert_service_signal.value[0] = 1 self.dp_processed = [] self.dp_engine_worker_queue_server = [] @@ -741,10 +741,12 @@ def launch_components(self): if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM: address = ( self.cfg.master_ip, - int(self.cfg.parallel_config.engine_worker_queue_port[i]), + int(self.cfg.parallel_config.local_engine_worker_queue_port), ) else: - address = f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.engine_worker_queue_port[i]}.sock" + address = ( + f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.local_engine_worker_queue_port}.sock" + ) llm_logger.info(f"dp start queue service {address}") self.dp_engine_worker_queue_server.append( diff --git a/fastdeploy/engine/expert_service.py b/fastdeploy/engine/expert_service.py index 3b8c40cca3c..d604dbdd9fc 100644 --- a/fastdeploy/engine/expert_service.py +++ b/fastdeploy/engine/expert_service.py @@ -48,10 +48,7 @@ def __init__(self, cfg, local_data_parallel_id, start_queue=True): """ self.cfg = cfg - start_pos = (local_data_parallel_id * self.cfg.parallel_config.tensor_parallel_size) % cfg.worker_num_per_node - end_pos = start_pos + self.cfg.parallel_config.tensor_parallel_size if cfg.scheduler_config.splitwise_role != "mixed": - self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos] if envs.FD_ENABLE_INTERNAL_ADAPTER: envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT = envs.FD_ZMQ_RECV_REQUEST_SERVER_PORTS.split(",")[ local_data_parallel_id @@ -59,7 +56,6 @@ def __init__(self, cfg, local_data_parallel_id, start_queue=True): envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT = envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORTS.split(",")[ local_data_parallel_id ] - self.cfg.local_device_ids = self.cfg.parallel_config.device_ids.split(",")[start_pos:end_pos] llm_logger.info(f"local_data_parallel_id: {local_data_parallel_id}") if self.cfg.cache_config.num_gpu_blocks_override is None: @@ -67,13 +63,6 @@ def __init__(self, cfg, local_data_parallel_id, start_queue=True): else: self.do_profile = False - 
if cfg.scheduler_config.splitwise_role != "mixed": - if len(self.cfg.cache_config.pd_comm_port) == 1: - self.cfg.cache_config.pd_comm_port[0] = ( - int(self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id - ) - else: - self.cfg.cache_config.pd_comm_port = [self.cfg.cache_config.pd_comm_port[local_data_parallel_id]] self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id self.engine = EngineService(self.cfg, start_queue) if self.cfg.scheduler_config.name == "splitwise": diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index d82fbec849f..bca1f4a3b02 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -73,7 +73,7 @@ def _init_network(self): self.router_socket.setsockopt(zmq.LINGER, 0) self.router_socket.setsockopt(zmq.SNDHWM, 1000) self.router_socket.setsockopt(zmq.ROUTER_MANDATORY, 1) - self.router_socket.bind(f"tcp://*:{self.cfg.cache_config.pd_comm_port[0]}") + self.router_socket.bind(f"tcp://*:{self.cfg.cache_config.pd_comm_port}") self.logger.info(f"_init_network: bind {self.cfg.cache_config.pd_comm_port}") self.poller = zmq.Poller() @@ -335,7 +335,7 @@ def send_cache_info_to_prefill(self, tasks: List[Request]): if dsg_info["transfer_protocol"] == "ipc": info = { "request_id": tasks[i].request_id, - "device_ids": self.cfg.parallel_config.device_ids.split(","), + "device_ids": self.cfg.local_device_ids, "transfer_protocol": "ipc", "dest_block_ids": dsg_info["block_tables"], } @@ -351,11 +351,9 @@ def send_cache_info_to_prefill(self, tasks: List[Request]): else: info = { "request_id": tasks[i].request_id, - "device_ids": [self.cfg.parallel_config.device_ids.split(",")[self.local_data_parallel_id]], + "device_ids": self.cfg.local_device_ids, "ip": self.cfg.host_ip, - "rdma_ports": [ - self.cfg.disaggregate_info["cache_info"]["rdma"]["rdma_port"][self.local_data_parallel_id] - ], + "rdma_ports": self.cfg.disaggregate_info["cache_info"]["rdma"]["rdma_port"], "transfer_protocol": "rdma", "dest_block_ids": dsg_info["block_tables"], "decode_tp_size": self.cfg.parallel_config.tensor_parallel_size, diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index a0878fa7c73..5c55881c32d 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -603,6 +603,19 @@ def get_random_port(): continue +def parse_ports(ports): + if ports is None: + return None + elif isinstance(ports, int): + return [ports] + elif isinstance(ports, str): + return [int(p) for p in ports.split(",")] + elif isinstance(ports, list): + return [int(p) for p in ports] + else: + raise TypeError(f"Cannot parse ports into List[int]: {ports}") + + def is_port_available(host, port): """ Check the port is available @@ -614,13 +627,65 @@ def is_port_available(host, port): try: s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) s.bind((host, port)) - return True except OSError as e: if e.errno == errno.EADDRINUSE: return False + finally: + s.close() return True +def find_free_ports( + port_range: tuple[int, int] = (8000, 65535), + num_ports: int = 1, + host: str = "0.0.0.0", +) -> list[int]: + """ + Find available TCP ports in a given range, scanning from a random start. + + Args: + port_range: (start, end), inclusive, e.g. (20000, 30000). + num_ports: number of ports to find. + host: host to bind, default "0.0.0.0". + + Returns: + List of available ports with length == num_ports. + + Raises: + ValueError: invalid port range or num_ports <= 0. + RuntimeError: not enough free ports in the range. 
+ """ + start, end = port_range + if start < 0 or end > 65535 or start > end: + raise ValueError(f"Invalid port range: {port_range}") + + if num_ports <= 0: + raise ValueError("num_ports must be a positive integer") + + total_ports = end - start + 1 + if num_ports > total_ports: + raise ValueError("num_ports is larger than range size") + + # Generate all ports and rotate with a random start index + ports = list(range(start, end + 1)) + offset = random.randint(0, total_ports - 1) + ports = ports[offset:] + ports[:offset] + + free_ports: list[int] = [] + + for port in ports: + if is_port_available(host, port): + free_ports.append(port) + + if len(free_ports) >= num_ports: + break + + if len(free_ports) < num_ports: + raise RuntimeError(f"Only found {len(free_ports)} free ports in {port_range}, requested {num_ports}.") + + return free_ports + + def singleton(cls): """ Singleton decorator for a class. diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 02d66f4bc53..74dc9efe9ba 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -928,10 +928,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: parallel_config.num_experts_per_rank = num_experts_per_rank parallel_config.num_experts_start_offset = num_experts_start_offset - if args.load_strategy != "meta": - parallel_config.engine_worker_queue_port = parallel_config.engine_worker_queue_port[ - parallel_config.local_data_parallel_id - ] parallel_config.set_communicate_group() load_config = LoadConfig(vars(args)) @@ -1004,6 +1000,12 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: structured_outputs_config=structured_outputs_config, eplb_config=eplb_config, ) + + if args.load_strategy != "meta": + fd_config.parallel_config.engine_worker_queue_port = fd_config.parallel_config.engine_worker_queue_port[ + parallel_config.local_data_parallel_id + ] + update_fd_config_for_mm(fd_config) if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config): fd_config.load_config.load_choices = "default" From 29a0ee5153f9b9eb11aebf2225f52eb870d52dc2 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Tue, 9 Dec 2025 11:34:37 +0800 Subject: [PATCH 02/17] [fix] fix some bugs --- examples/splitwise/start_v1_dp2.sh | 6 -- examples/splitwise/start_v1_tp1.sh | 34 ++++---- examples/splitwise/start_v1_tp2.sh | 82 +++++++++++++++++++ fastdeploy/engine/args_utils.py | 10 ++- fastdeploy/engine/common_engine.py | 1 + .../entrypoints/openai/multi_api_server.py | 18 ++-- fastdeploy/worker/gcu_model_runner.py | 2 +- fastdeploy/worker/gpu_model_runner.py | 6 +- fastdeploy/worker/hpu_model_runner.py | 2 +- fastdeploy/worker/metax_model_runner.py | 4 +- fastdeploy/worker/worker_process.py | 28 +++---- 11 files changed, 138 insertions(+), 55 deletions(-) create mode 100644 examples/splitwise/start_v1_tp2.sh diff --git a/examples/splitwise/start_v1_dp2.sh b/examples/splitwise/start_v1_dp2.sh index 21a9562559f..cfcb57126b3 100644 --- a/examples/splitwise/start_v1_dp2.sh +++ b/examples/splitwise/start_v1_dp2.sh @@ -31,7 +31,6 @@ source ${SCRIPT_DIR}/utils.sh # start router ROUTER_PORT=$(get_free_ports 1) -echo "---------------------------" echo ROUTER_PORT: $ROUTER_PORT export FD_LOG_DIR="log/$LOG_DATE/router" @@ -52,7 +51,6 @@ P_ENGINE_WORKER_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) P_CACHE_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) P_RDMA_COMM_PORTS=$(get_free_ports $NUM_GPUS) 
P_PD_COMM_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -echo "---------------------------" echo P_SERVER_PORTS: $P_SERVER_PORTS echo P_METRICS_PORTS: $P_METRICS_PORTS echo P_ENGINE_WORKER_QUEUE_PORTS: $P_ENGINE_WORKER_QUEUE_PORTS @@ -82,7 +80,6 @@ nohup python -m fastdeploy.entrypoints.openai.multi_api_server \ --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & -echo "--- Health Check Status ---" wait_for_health ${P_SERVER_PORTS} @@ -93,7 +90,6 @@ D_CACHE_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) D_METRICS_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) D_RDMA_COMM_PORTS=$(get_free_ports $NUM_GPUS) D_PD_COMM_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -echo "---------------------------" echo D_SERVER_PORTS: $D_SERVER_PORTS echo D_ENGINE_WORKER_QUEUE_PORTS: $D_ENGINE_WORKER_QUEUE_PORTS echo D_CACHE_QUEUE_PORTS: $D_CACHE_QUEUE_PORTS @@ -123,12 +119,10 @@ nohup python -m fastdeploy.entrypoints.openai.multi_api_server \ --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & -echo "--- Health Check Status ---" wait_for_health ${D_SERVER_PORTS} # send request -echo "------ Request Check ------" sleep 10 # make sure server is registered to router curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \ -H "Content-Type: application/json" \ diff --git a/examples/splitwise/start_v1_tp1.sh b/examples/splitwise/start_v1_tp1.sh index 7f5a373303b..3d081bf7c83 100644 --- a/examples/splitwise/start_v1_tp1.sh +++ b/examples/splitwise/start_v1_tp1.sh @@ -17,11 +17,11 @@ SCRIPT_DIR=$(dirname "$SCRIPT_PATH") source ${SCRIPT_DIR}/utils.sh unset http_proxy && unset https_proxy -rm -rf log_* P_PORT=52400 D_PORT=52500 ROUTER_PORT=52700 +LOG_DATE=$(date +%Y%m%d_%H%M%S) ports=($P_PORT $D_PORT $ROUTER_PORT) check_ports "${ports[@]}" || { @@ -30,8 +30,8 @@ check_ports "${ports[@]}" || { } # start router -export FD_LOG_DIR="log_router" -mkdir -p ${FD_LOG_DIR} +export FD_LOG_DIR="log/$LOG_DATE/router" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.router.launch \ --port ${ROUTER_PORT} \ @@ -40,29 +40,29 @@ nohup python -m fastdeploy.router.launch \ # start prefill export CUDA_VISIBLE_DEVICES=0 -export FD_LOG_DIR="log_prefill" -mkdir -p ${FD_LOG_DIR} +export FD_LOG_DIR="log/$LOG_DATE/prefill" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ - --model ${MODEL_NAME} \ - --port "${P_PORT}" \ - --splitwise-role "prefill" \ - --router "0.0.0.0:${ROUTER_PORT}" \ - 2>&1 >${FD_LOG_DIR}/nohup & + --model ${MODEL_NAME} \ + --port "${P_PORT}" \ + --splitwise-role "prefill" \ + --router "0.0.0.0:${ROUTER_PORT}" \ +2>&1 >${FD_LOG_DIR}/nohup & wait_for_health ${P_PORT} # start decode export CUDA_VISIBLE_DEVICES=1 -export FD_LOG_DIR="log_decode" -mkdir -p ${FD_LOG_DIR} +export FD_LOG_DIR="log/$LOG_DATE/decode" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ - --model ${MODEL_NAME} \ - --port "${D_PORT}" \ - --splitwise-role "decode" \ - --router "0.0.0.0:${ROUTER_PORT}" \ - 2>&1 >${FD_LOG_DIR}/nohup & + --model ${MODEL_NAME} \ + --port "${D_PORT}" \ + --splitwise-role "decode" \ + --router "0.0.0.0:${ROUTER_PORT}" \ +2>&1 >${FD_LOG_DIR}/nohup & wait_for_health ${D_PORT} diff --git a/examples/splitwise/start_v1_tp2.sh b/examples/splitwise/start_v1_tp2.sh new file mode 100644 index 00000000000..88cd24d75fa --- /dev/null +++ b/examples/splitwise/start_v1_tp2.sh @@ -0,0 +1,82 @@ +#!/bin/bash +set -e + +# Test splitwise deployment +# There are two methods for 
splitwise deployment: +# v0: using splitwise_scheduler or dp_scheduler +# v1: using local_scheduler + router + +# prepare environment +export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" +export FD_DEBUG=1 +export ENABLE_V1_KVCACHE_SCHEDULER=1 +export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 + +SCRIPT_PATH=$(readlink -f "$0") +SCRIPT_DIR=$(dirname "$SCRIPT_PATH") +source ${SCRIPT_DIR}/utils.sh + +unset http_proxy && unset https_proxy + +P_PORT=52400 +D_PORT=52500 +ROUTER_PORT=52700 +LOG_DATE=$(date +%Y%m%d_%H%M%S) + +ports=($P_PORT $D_PORT $ROUTER_PORT) +check_ports "${ports[@]}" || { + echo "❌ Some ports are in use. Please release them." + exit 1 +} + +# start router +export FD_LOG_DIR="log/$LOG_DATE/router" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} + +nohup python -m fastdeploy.router.launch \ + --port ${ROUTER_PORT} \ + --splitwise \ + 2>&1 >${FD_LOG_DIR}/nohup & + +# start prefill +export CUDA_VISIBLE_DEVICES=0,1 +export FD_LOG_DIR="log/$LOG_DATE/prefill" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} + +nohup python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port "${P_PORT}" \ + --tensor-parallel-size 2 \ + --splitwise-role "prefill" \ + --router "0.0.0.0:${ROUTER_PORT}" \ +2>&1 >${FD_LOG_DIR}/nohup & + +wait_for_health ${P_PORT} + +# start decode +export CUDA_VISIBLE_DEVICES=2,3 +export FD_LOG_DIR="log/$LOG_DATE/decode" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} + +nohup python -m fastdeploy.entrypoints.openai.api_server \ + --model ${MODEL_NAME} \ + --port "${D_PORT}" \ + --tensor-parallel-size 2 \ + --splitwise-role "decode" \ + --router "0.0.0.0:${ROUTER_PORT}" \ +2>&1 >${FD_LOG_DIR}/nohup & + +wait_for_health ${D_PORT} + +# send request +sleep 10 # make sure server is registered to router +echo "send request..." +curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "hello"} + ], + "max_tokens": 100, + "stream": false +}' diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index c759b8feef5..1b2873323ca 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -539,19 +539,21 @@ def __post_init__(self): def post_init_all_ports(self): def post_init_ports(name: str, ports: list, num_total_ports: int): - num_expected_ports = num_total_ports + ports = parse_ports(ports) + num_cur_dp_ports = num_total_ports if envs.FD_ENABLE_MULTI_API_SERVER: - num_expected_ports //= self.data_parallel_size + num_cur_dp_ports //= self.data_parallel_size if ports is None: - ports = find_free_ports(num_ports=num_expected_ports) + ports = find_free_ports(num_ports=num_cur_dp_ports) console_logger.info(f"Parameter `{name}` is not specified, found available ports for use: {ports}") else: assert ( len(ports) == num_total_ports ), f"Parameter `{name}` should have {num_total_ports} ports, got {len(ports)}." - ports = parse_ports(ports) for port in ports: assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." 
+ + console_logger.debug(f"post init {name}: {ports}") return ports num_nodes = len(self.ips) if self.ips else 1 diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 019a44805a6..f25cd9457a7 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -107,6 +107,7 @@ def __init__(self, cfg, start_queue=True): self.start_worker_queue_service(start_queue) os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.cfg.parallel_config.local_engine_worker_queue_port) + llm_logger.info(f"INFERENCE_MSG_QUEUE_ID: {str(self.cfg.parallel_config.local_engine_worker_queue_port)}") self.split_connector = SplitwiseConnector(cfg, self.engine_worker_queue, self.resource_manager) self.token_processor = TokenProcessor( diff --git a/fastdeploy/entrypoints/openai/multi_api_server.py b/fastdeploy/entrypoints/openai/multi_api_server.py index a34cb137a9a..e16d131f724 100644 --- a/fastdeploy/entrypoints/openai/multi_api_server.py +++ b/fastdeploy/entrypoints/openai/multi_api_server.py @@ -20,7 +20,7 @@ import sys import time -from fastdeploy.utils import get_logger, is_port_available +from fastdeploy.utils import find_free_ports, get_logger, is_port_available logger = get_logger("multi_api_server", "multi_api_server.log") @@ -28,16 +28,23 @@ def start_servers(server_count, server_args, ports, metrics_ports, controller_ports): processes = [] logger.info(f"Starting servers on ports: {ports} with args: {server_args} and metrics ports: {metrics_ports}") + port_idx = {} for i in range(len(server_args)): if server_args[i] == "--engine-worker-queue-port": - engine_worker_queue_port = server_args[i + 1].split(",") - break + port_idx["engine_worker_queue_port"] = i + 1 + if "engine_worker_queue_port" not in port_idx: + port = find_free_ports(num_ports=server_count) + server_args += ["--engine-worker-queue-port", ",".join(map(str, port))] + port_idx["engine_worker_queue_port"] = len(server_args) - 1 + logger.info(f"No --engine-worker-queue-port specified, using random ports: {port}") + engine_worker_queue_port = server_args[port_idx["engine_worker_queue_port"]].split(",") + if not check_param(engine_worker_queue_port, server_count): + return + if not check_param(ports, server_count): return if not check_param(metrics_ports, server_count): return - if not check_param(engine_worker_queue_port, server_count): - return if controller_ports != "-1": controller_ports = controller_ports.split(",") if not check_param(controller_ports, server_count): @@ -45,6 +52,7 @@ def start_servers(server_count, server_args, ports, metrics_ports, controller_po else: controller_ports = [-1] * server_count # check_param(server_args, server_count) + logger.info(f"Modified server_args: {server_args}") for i in range(server_count): port = int(ports[i]) metrics_port = int(metrics_ports[i]) diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 6bd8da02b24..752bcd340aa 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -103,7 +103,7 @@ def __init__( self.forward_meta: ForwardMeta = None # Postprocess Env params - os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port) + os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.local_engine_worker_queue_port) def exist_prefill(self): """ diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index b2d9c5486e2..2e6c4f834bd 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ 
b/fastdeploy/worker/gpu_model_runner.py @@ -199,8 +199,8 @@ def __init__( self.forward_meta: ForwardMeta = None # Postprocess Env params - os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port) - logger.info(f"queue id is {str(self.parallel_config.engine_worker_queue_port)}") + os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.local_engine_worker_queue_port) + logger.info(f"queue id is {str(self.parallel_config.local_engine_worker_queue_port)}") self.zmq_client = None self.async_output_queue = None @@ -1505,7 +1505,7 @@ def initialize_kv_cache(self, profile: bool = False) -> None: name="cache_ready_signal", array=cache_ready_signal_data, dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, + suffix=self.parallel_config.local_engine_worker_queue_port, create=False, ) diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py index 5811a1516d2..6b6650a05f2 100644 --- a/fastdeploy/worker/hpu_model_runner.py +++ b/fastdeploy/worker/hpu_model_runner.py @@ -386,7 +386,7 @@ def __init__( self.is_hpu_perf_breakdown_sync_mode = int(os.environ.get("HPU_PERF_BREAKDOWN_SYNC_MODE", 1)) == 1 # Postprocess Env params os.environ["INFERENCE_MSG_QUEUE_ID"] = str( - self.local_rank + int(self.parallel_config.engine_worker_queue_port) + self.local_rank + int(self.parallel_config.local_engine_worker_queue_port) ) if int(os.environ.get("HABANA_PROFILE", 0)) == 1: diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index 3038a34fc2b..e2610f18c6e 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -151,8 +151,8 @@ def __init__( self.forward_meta: ForwardMeta = None # Postprocess Env params - os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.engine_worker_queue_port) - logger.info(f"queue id is {str(self.parallel_config.engine_worker_queue_port)}") + os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.parallel_config.local_engine_worker_queue_port) + logger.info(f"queue id is {str(self.parallel_config.local_engine_worker_queue_port)}") self.zmq_client = None self.async_output_queue = None diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 74dc9efe9ba..ced67bb699c 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -217,7 +217,7 @@ def init_health_status(self) -> None: name="worker_healthy_live_signal", array=workers_alive, dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, + suffix=self.parallel_config.local_engine_worker_queue_port, create=False, ) local_rank = self.local_rank % self.parallel_config.tensor_parallel_size @@ -229,7 +229,7 @@ def init_health_status(self) -> None: name="model_weights_status", array=workers_model_weights, dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, + suffix=self.parallel_config.local_engine_worker_queue_port, create=False, ) @@ -239,7 +239,7 @@ def init_health_status(self) -> None: name="exist_task_signal", array=workers_exist_task, dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, + suffix=self.parallel_config.local_engine_worker_queue_port, create=False, ) @@ -249,7 +249,7 @@ def init_health_status(self) -> None: name="exist_swapped_task_signal", array=workers_swapped_task, dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, + suffix=self.parallel_config.local_engine_worker_queue_port, create=False, ) @@ -259,7 
+259,7 @@ def init_health_status(self) -> None: name="exist_prefill_task_signal", array=exist_prefill_task_signal_data, dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, + suffix=self.parallel_config.local_engine_worker_queue_port, create=False, ) @@ -304,11 +304,11 @@ def _init_eplb_signal(self): rank=self.local_rank, ep_size=self.ranks, fd_config=self.fd_config, - ipc_signal_suffix=self.parallel_config.engine_worker_queue_port, + ipc_signal_suffix=self.parallel_config.local_engine_worker_queue_port, ) dp_ipc_signal_suffix = ( - f"{self.parallel_config.engine_worker_queue_port}_dp{self.parallel_config.local_data_parallel_id}" + f"{self.parallel_config.local_engine_worker_queue_port}_dp{self.parallel_config.local_data_parallel_id}" ) if local_rank == 0: # master rank0 signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32) @@ -355,7 +355,7 @@ def _init_eplb_signal(self): [MODEL_MAIN_NAME], self.local_rank, self.ranks, - shm_uuid=self.parallel_config.engine_worker_queue_port, + shm_uuid=self.parallel_config.local_engine_worker_queue_port, eplb_config=self.eplb_config, logger=logger, ) @@ -470,7 +470,7 @@ def event_loop_normal(self) -> None: self.model_weights_status, # model_weights_signal self.worker.model_runner, - self.parallel_config.engine_worker_queue_port, + self.parallel_config.local_engine_worker_queue_port, ) logger.info(f"current task queue data: {self.task_queue.num_tasks()}") self.task_queue.clear_data() @@ -594,10 +594,10 @@ def start_task_queue_service(self): if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM: task_address = ( self.parallel_config.pod_ip, - self.parallel_config.engine_worker_queue_port, + self.parallel_config.local_engine_worker_queue_port, ) else: - task_address = f"/dev/shm/fd_task_queue_{self.parallel_config.engine_worker_queue_port}.sock" + task_address = f"/dev/shm/fd_task_queue_{self.parallel_config.local_engine_worker_queue_port}.sock" logger.info(f"connect task queue address {task_address}") self.task_queue = TaskQueue( address=task_address, @@ -1000,11 +1000,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: structured_outputs_config=structured_outputs_config, eplb_config=eplb_config, ) - - if args.load_strategy != "meta": - fd_config.parallel_config.engine_worker_queue_port = fd_config.parallel_config.engine_worker_queue_port[ - parallel_config.local_data_parallel_id - ] + logger.info(f"parallel_config.local_engine_worker_queue_port {parallel_config.local_engine_worker_queue_port}") update_fd_config_for_mm(fd_config) if fd_config.load_config.load_choices == "default_v1" and not v1_loader_support(fd_config): From 47a86a8c9f8263662a837bac039d46d2f2f0ee00 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Tue, 9 Dec 2025 16:18:58 +0800 Subject: [PATCH 03/17] [fix] fix rdma port for cache manager/messager --- fastdeploy/cache_manager/prefix_cache_manager.py | 4 ++-- .../cache_manager/transfer_factory/rdma_cache_transfer.py | 2 +- fastdeploy/utils.py | 5 ++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index af6a6eef595..ca8148af90f 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -277,7 +277,7 @@ def launch_cache_manager( + f" --ipc_suffix {ipc_suffix}" + f" --protocol {cache_config.cache_transfer_protocol}" + f" --local_data_parallel_id {self.local_data_parallel_id}" - + f" --rdma_port 
{cache_config.rdma_comm_ports[0] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + f" --speculative_config '{self.speculative_config.to_json_string()}'" + (" --create_cache_tensor" if create_cache_tensor else "") + f" >{log_dir}/launch_cache_transfer_manager_tprank{i}.log 2>&1" @@ -370,7 +370,7 @@ def launch_cache_messager( + f" --protocol {cache_config.cache_transfer_protocol}" + f" --local_data_parallel_id {self.local_data_parallel_id}" + f" --ipc_suffix {ipc_suffix}" - + f" --rdma_port {cache_config.rdma_comm_ports[0] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + f" --speculative_config '{self.speculative_config.to_json_string()}'" + f" >{log_dir}/launch_cache_messager_tprank{i}.log 2>&1" ) diff --git a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py index 62fc2ce05b0..8e759bd15a4 100644 --- a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py +++ b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py @@ -84,7 +84,7 @@ def __init__( try: import rdma_comm - except: + except ImportError: raise RuntimeError( "The installation of the RDMA library failed. Confirm whether your network card supports RDMA transmission." ) diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index 2c9c87a82eb..398ba43f9ac 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -627,12 +627,11 @@ def is_port_available(host, port): try: s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) s.bind((host, port)) + return True except OSError as e: if e.errno == errno.EADDRINUSE: return False - finally: - s.close() - return True + raise def find_free_ports( From f07c2e03e797c6f0232660ded62a25df134d7584 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Tue, 9 Dec 2025 19:13:14 +0800 Subject: [PATCH 04/17] [fix] temporarily cancel port availability check to see if it can pass ci test --- fastdeploy/engine/args_utils.py | 7 +++---- fastdeploy/utils.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index e7612951dda..7f5a3ca6de4 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -43,12 +43,11 @@ ) from fastdeploy.platforms import current_platform from fastdeploy.scheduler.config import SchedulerConfig -from fastdeploy.utils import ( +from fastdeploy.utils import ( # is_port_available, DeprecatedOptionWarning, FlexibleArgumentParser, console_logger, find_free_ports, - is_port_available, parse_ports, parse_quantization, ) @@ -554,8 +553,8 @@ def post_init_ports(name: str, ports: list, num_total_ports: int): assert ( len(ports) == num_total_ports ), f"Parameter `{name}` should have {num_total_ports} ports, got {len(ports)}." - for port in ports: - assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." + # for port in ports: + # assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." 
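Editor's note: for reference, the bind-probe that these (now commented-out) assertions rely on can be reproduced standalone. The sketch below mirrors the PATCH 03 behaviour of is_port_available — a successful bind means the port is free, EADDRINUSE means it is taken, and any other OSError propagates — but it is a simplified illustration, not the library code itself.

import errno
import socket

def is_port_available(host: str, port: int) -> bool:
    # Probe by binding: if the bind succeeds, nothing is listening on the port.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            s.bind((host, port))
            return True
        except OSError as e:
            if e.errno == errno.EADDRINUSE:
                return False
            raise

# Example: probe a server port before launching.
print(is_port_available("0.0.0.0", 8000))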
console_logger.debug(f"post init {name}: {ports}") return ports diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index 398ba43f9ac..f8330c35867 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -631,7 +631,7 @@ def is_port_available(host, port): except OSError as e: if e.errno == errno.EADDRINUSE: return False - raise + return True def find_free_ports( From cb2e6181baf1efc4265ac91e2db564c9af51061d Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Tue, 9 Dec 2025 20:48:08 +0800 Subject: [PATCH 05/17] [feat] simplify args for multi api server --- examples/splitwise/start_v0_tp1.sh | 14 ++-- examples/splitwise/start_v1_dp2.sh | 45 +----------- examples/splitwise/start_v1_tp1.sh | 2 - .../entrypoints/openai/multi_api_server.py | 73 ++++++++++++++++--- fastdeploy/worker/worker_process.py | 6 +- 5 files changed, 71 insertions(+), 69 deletions(-) diff --git a/examples/splitwise/start_v0_tp1.sh b/examples/splitwise/start_v0_tp1.sh index c91bcf9d302..a41bc310f47 100644 --- a/examples/splitwise/start_v0_tp1.sh +++ b/examples/splitwise/start_v0_tp1.sh @@ -22,12 +22,12 @@ if [ -z "${KVCACHE_RDMA_NICS}" ]; then fi unset http_proxy && unset https_proxy -rm -rf log_* -source ./utils.sh +source ${SCRIPT_DIR}/utils.sh P_PORT=52400 D_PORT=52500 -REDIS_PORT="${REDIS_PORT:-56388}" +REDIS_PORT="${REDIS_PORT:-6379}" +LOG_DATE=$(date +%Y%m%d_%H%M%S) ports=( $P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5)) @@ -51,8 +51,8 @@ sleep 1 # start prefill export CUDA_VISIBLE_DEVICES=0 -export FD_LOG_DIR="log_prefill" -mkdir -p ${FD_LOG_DIR} +export FD_LOG_DIR="log/$LOG_DATE/prefill" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ @@ -76,8 +76,8 @@ wait_for_health ${P_PORT} # start decode export CUDA_VISIBLE_DEVICES=1 -export FD_LOG_DIR="log_decode" -mkdir -p ${FD_LOG_DIR} +export FD_LOG_DIR="log/$LOG_DATE/decode" +rm -rf ${FD_LOG_DIR} && mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.api_server \ --model ${MODEL_NAME} \ diff --git a/examples/splitwise/start_v1_dp2.sh b/examples/splitwise/start_v1_dp2.sh index cfcb57126b3..3561dfb2016 100644 --- a/examples/splitwise/start_v1_dp2.sh +++ b/examples/splitwise/start_v1_dp2.sh @@ -9,25 +9,16 @@ set -e MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" DATA_PARALLEL_SIZE=2 TENSOR_PARALLEL_SIZE=1 -NUM_GPUS=$(($DATA_PARALLEL_SIZE * $TENSOR_PARALLEL_SIZE)) LOG_DATE=$(date +%Y%m%d_%H%M%S) export FD_DEBUG=1 -export ENABLE_V1_KVCACHE_SCHEDULER=1 -export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 export FD_ENABLE_MULTI_API_SERVER=1 SCRIPT_PATH=$(readlink -f "$0") SCRIPT_DIR=$(dirname "$SCRIPT_PATH") -export $(bash ${SCRIPT_DIR}/../../scripts/get_rdma_nics.sh gpu) -echo "KVCACHE_RDMA_NICS:${KVCACHE_RDMA_NICS}" -if [ -z "${KVCACHE_RDMA_NICS}" ]; then - echo "KVCACHE_RDMA_NICS is empty, please check the output of get_rdma_nics.sh" - exit 1 -fi +source ${SCRIPT_DIR}/utils.sh unset http_proxy && unset https_proxy -source ${SCRIPT_DIR}/utils.sh # start router ROUTER_PORT=$(get_free_ports 1) @@ -46,17 +37,7 @@ sleep 1 # start prefill P_SERVER_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -P_METRICS_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -P_ENGINE_WORKER_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -P_CACHE_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -P_RDMA_COMM_PORTS=$(get_free_ports $NUM_GPUS) -P_PD_COMM_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) echo P_SERVER_PORTS: $P_SERVER_PORTS -echo P_METRICS_PORTS: $P_METRICS_PORTS -echo 
P_ENGINE_WORKER_QUEUE_PORTS: $P_ENGINE_WORKER_QUEUE_PORTS -echo P_CACHE_QUEUE_PORTS: $P_CACHE_QUEUE_PORTS -echo P_RDMA_COMM_PORTS: $P_RDMA_COMM_PORTS -echo P_PD_COMM_PORTS: $P_PD_COMM_PORTS export CUDA_VISIBLE_DEVICES="0,1" export FD_LOG_DIR="log/$LOG_DATE/prefill" @@ -66,17 +47,11 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.multi_api_server \ --num-servers ${DATA_PARALLEL_SIZE}\ --ports ${P_SERVER_PORTS} \ - --metrics-port ${P_METRICS_PORTS} \ --args --model ${MODEL_NAME} \ - --engine-worker-queue-port ${P_ENGINE_WORKER_QUEUE_PORTS} \ - --cache-queue-port ${P_CACHE_QUEUE_PORTS} \ --max-model-len 32768 \ --data-parallel-size ${DATA_PARALLEL_SIZE} \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ --splitwise-role "prefill" \ - --cache-transfer-protocol "rdma" \ - --rdma-comm-ports ${P_RDMA_COMM_PORTS} \ - --pd-comm-port ${P_PD_COMM_PORTS} \ --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & @@ -85,19 +60,9 @@ wait_for_health ${P_SERVER_PORTS} # start decode D_SERVER_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -D_ENGINE_WORKER_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -D_CACHE_QUEUE_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -D_METRICS_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) -D_RDMA_COMM_PORTS=$(get_free_ports $NUM_GPUS) -D_PD_COMM_PORTS=$(get_free_ports $DATA_PARALLEL_SIZE) echo D_SERVER_PORTS: $D_SERVER_PORTS -echo D_ENGINE_WORKER_QUEUE_PORTS: $D_ENGINE_WORKER_QUEUE_PORTS -echo D_CACHE_QUEUE_PORTS: $D_CACHE_QUEUE_PORTS -echo D_METRICS_PORTS: $D_METRICS_PORTS -echo D_RDMA_COMM_PORTS: $D_RDMA_COMM_PORTS -echo D_PD_COMM_PORTS: $D_PD_COMM_PORTS -export CUDA_VISIBLE_DEVICES="2,3" +export CUDA_VISIBLE_DEVICES="4,5" export FD_LOG_DIR="log/$LOG_DATE/decode" rm -rf $FD_LOG_DIR mkdir -p ${FD_LOG_DIR} @@ -105,17 +70,11 @@ mkdir -p ${FD_LOG_DIR} nohup python -m fastdeploy.entrypoints.openai.multi_api_server \ --num-servers ${DATA_PARALLEL_SIZE}\ --ports ${D_SERVER_PORTS} \ - --metrics-port ${D_METRICS_PORTS} \ --args --model ${MODEL_NAME} \ - --engine-worker-queue-port ${D_ENGINE_WORKER_QUEUE_PORTS} \ - --cache-queue-port ${D_CACHE_QUEUE_PORTS} \ --max-model-len 32768 \ --data-parallel-size ${DATA_PARALLEL_SIZE} \ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \ --splitwise-role "decode" \ - --cache-transfer-protocol "rdma" \ - --rdma-comm-ports ${D_RDMA_COMM_PORTS} \ - --pd-comm-port ${D_PD_COMM_PORTS} \ --router "0.0.0.0:${ROUTER_PORT}" \ 2>&1 >${FD_LOG_DIR}/nohup & diff --git a/examples/splitwise/start_v1_tp1.sh b/examples/splitwise/start_v1_tp1.sh index 3d081bf7c83..611be803457 100644 --- a/examples/splitwise/start_v1_tp1.sh +++ b/examples/splitwise/start_v1_tp1.sh @@ -9,8 +9,6 @@ set -e # prepare environment export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle" export FD_DEBUG=1 -export ENABLE_V1_KVCACHE_SCHEDULER=1 -export KVCACHE_GDRCOPY_FLUSH_ENABLE=1 SCRIPT_PATH=$(readlink -f "$0") SCRIPT_DIR=$(dirname "$SCRIPT_PATH") diff --git a/fastdeploy/entrypoints/openai/multi_api_server.py b/fastdeploy/entrypoints/openai/multi_api_server.py index e16d131f724..40e268e9d2a 100644 --- a/fastdeploy/entrypoints/openai/multi_api_server.py +++ b/fastdeploy/entrypoints/openai/multi_api_server.py @@ -20,18 +20,33 @@ import sys import time +from fastdeploy.platforms import current_platform from fastdeploy.utils import find_free_ports, get_logger, is_port_available logger = get_logger("multi_api_server", "multi_api_server.log") -def start_servers(server_count, server_args, ports, metrics_ports, controller_ports): +def start_servers( + server_count=None, + 
device_count=None, + server_args=None, + ports=None, + metrics_ports=None, + controller_ports=None, +): processes = [] logger.info(f"Starting servers on ports: {ports} with args: {server_args} and metrics ports: {metrics_ports}") port_idx = {} for i in range(len(server_args)): if server_args[i] == "--engine-worker-queue-port": port_idx["engine_worker_queue_port"] = i + 1 + if server_args[i] == "--cache-queue-port": + port_idx["cache_queue_port"] = i + 1 + if server_args[i] == "--pd-comm-port": + port_idx["pd_comm_port"] = i + 1 + if server_args[i] == "--rdma-comm-port": + port_idx["rdma_comm_port"] = i + 1 + if "engine_worker_queue_port" not in port_idx: port = find_free_ports(num_ports=server_count) server_args += ["--engine-worker-queue-port", ",".join(map(str, port))] @@ -41,21 +56,51 @@ def start_servers(server_count, server_args, ports, metrics_ports, controller_po if not check_param(engine_worker_queue_port, server_count): return - if not check_param(ports, server_count): + if "cache_queue_port" not in port_idx: + port = find_free_ports(num_ports=server_count) + server_args += ["--cache-queue-port", ",".join(map(str, port))] + port_idx["cache_queue_port"] = len(server_args) - 1 + logger.info(f"No --cache-queue-port specified, using random ports: {port}") + cache_queue_port = server_args[port_idx["cache_queue_port"]].split(",") + if not check_param(cache_queue_port, server_count): + return + + if "pd_comm_port" not in port_idx: + port = find_free_ports(num_ports=server_count) + server_args += ["--pd-comm-port", ",".join(map(str, port))] + port_idx["pd_comm_port"] = len(server_args) - 1 + logger.info(f"No --pd-comm-port specified, using random ports: {port}") + pd_comm_port = server_args[port_idx["pd_comm_port"]].split(",") + if not check_param(pd_comm_port, server_count): return - if not check_param(metrics_ports, server_count): + + if "rdma_comm_port" not in port_idx: + port = find_free_ports(num_ports=device_count) + server_args += ["--rdma-comm-port", ",".join(map(str, port))] + port_idx["rdma_comm_port"] = len(server_args) - 1 + logger.info(f"No --rdma-comm-port specified, using random ports: {port}") + rdma_comm_port = server_args[port_idx["rdma_comm_port"]].split(",") + if not check_param(rdma_comm_port, device_count): return + + if not check_param(ports, server_count): + return + + if metrics_ports != "-1": + metrics_ports = metrics_ports.split(",") + if not check_param(metrics_ports, server_count): + return + if controller_ports != "-1": controller_ports = controller_ports.split(",") if not check_param(controller_ports, server_count): return else: controller_ports = [-1] * server_count - # check_param(server_args, server_count) + logger.info(f"Modified server_args: {server_args}") for i in range(server_count): port = int(ports[i]) - metrics_port = int(metrics_ports[i]) controller_port = int(controller_ports[i]) env = os.environ.copy() @@ -67,13 +112,13 @@ def start_servers(server_count, server_args, ports, metrics_ports, controller_po *server_args, "--port", str(port), - "--metrics-port", - str(metrics_port), "--controller-port", str(controller_port), "--local-data-parallel-id", str(i), ] + if metrics_ports != "-1": + cmd += [metrics_ports[i], "--controller-port"] # 启动子进程 proc = subprocess.Popen(cmd, env=env) @@ -97,21 +142,25 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--ports", default="8000,8002", type=str, help="ports to the http server") parser.add_argument("--num-servers", default=2, type=int, help="number of workers") - 
parser.add_argument("--metrics-ports", default="8800,8802", type=str, help="ports for metrics server") + parser.add_argument("--metrics-ports", default="-1", type=str, help="ports for metrics server") parser.add_argument("--controller-ports", default="-1", type=str, help="ports for controller server port") parser.add_argument("--args", nargs=argparse.REMAINDER, help="remaining arguments are passed to api_server.py") args = parser.parse_args() logger.info(f"Starting {args.num_servers} servers on ports: {args.ports} with args: {args.args}") - # check_param(args.ports, args.num_servers) - # check_param(args.metrics_ports, args.num_servers) - # check_param(args.args.engine_worker_queue_port, args.num_servers) + + device_count = 0 + if current_platform.is_cuda(): + device_count = len(os.getenv("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(",")) + elif current_platform.is_xpu(): + device_count = len(os.getenv("XPU_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(",")) processes = start_servers( server_count=args.num_servers, + device_count=device_count, server_args=args.args, ports=args.ports.split(","), - metrics_ports=args.metrics_ports.split(","), + metrics_ports=args.metrics_ports, controller_ports=args.controller_ports, ) diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index df2f673a1df..66a933496cf 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -174,11 +174,7 @@ def init_health_status(self) -> None: model_weights_status: """ self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 - if ( - self.parallel_config.enable_expert_parallel - and self.parallel_config.data_parallel_size > 1 - and not envs.FD_ENABLE_MULTI_API_SERVER - ): + if self.parallel_config.data_parallel_size > 1 and not envs.FD_ENABLE_MULTI_API_SERVER: launched_expert_service_signal_data = np.zeros( shape=[self.parallel_config.data_parallel_size // self.fd_config.nnode], dtype=np.int32 ) From 9c9a660e611038857445669726be2adc71c48214 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Wed, 10 Dec 2025 15:58:55 +0800 Subject: [PATCH 06/17] [fix] fix dp --- .../cache_manager/prefix_cache_manager.py | 10 +- fastdeploy/config.py | 16 ++- fastdeploy/engine/common_engine.py | 109 +++++++++--------- fastdeploy/engine/engine.py | 14 +-- fastdeploy/engine/expert_service.py | 33 ++++-- fastdeploy/splitwise/splitwise_connector.py | 4 +- fastdeploy/worker/gpu_model_runner.py | 1 + 7 files changed, 106 insertions(+), 81 deletions(-) diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index ca8148af90f..bd54a4249d3 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -184,7 +184,7 @@ def launch_cache_manager( ) self.cache_task_queue = EngineCacheQueue( - address=(pod_ip, cache_config.cache_queue_port), + address=(pod_ip, cache_config.local_cache_queue_port), authkey=b"cache_queue_service", is_server=False, num_client=tensor_parallel_size, @@ -269,7 +269,7 @@ def launch_cache_manager( + f" --cache_dtype {cache_config.cache_dtype}" + f" --key_cache_shape {key_cache_shape}" + val_cache_arg_str - + f" --cache_queue_port {cache_config.cache_queue_port}" + + f" --cache_queue_port {cache_config.local_cache_queue_port}" + f" --enable_splitwise {int(self.enable_splitwise)}" + f" --pod_ip {pod_ip}" + f" --engine_worker_queue_port {engine_worker_queue_port}" @@ -277,7 +277,7 @@ def launch_cache_manager( + f" --ipc_suffix 
{ipc_suffix}" + f" --protocol {cache_config.cache_transfer_protocol}" + f" --local_data_parallel_id {self.local_data_parallel_id}" - + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --rdma_port {cache_config.local_rdma_comm_ports[i] if cache_config.local_rdma_comm_ports is not None else '0'}" + f" --speculative_config '{self.speculative_config.to_json_string()}'" + (" --create_cache_tensor" if create_cache_tensor else "") + f" >{log_dir}/launch_cache_transfer_manager_tprank{i}.log 2>&1" @@ -365,12 +365,12 @@ def launch_cache_messager( + f" --key_cache_shape {key_cache_shape}" + val_cache_arg_str + f" --pod_ip {pod_ip}" - + f" --cache_queue_port {cache_config.cache_queue_port}" + + f" --cache_queue_port {cache_config.local_cache_queue_port}" + f" --engine_worker_queue_port {engine_worker_queue_port}" + f" --protocol {cache_config.cache_transfer_protocol}" + f" --local_data_parallel_id {self.local_data_parallel_id}" + f" --ipc_suffix {ipc_suffix}" - + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --rdma_port {cache_config.local_rdma_comm_ports[i] if cache_config.local_rdma_comm_ports is not None else '0'}" + f" --speculative_config '{self.speculative_config.to_json_string()}'" + f" >{log_dir}/launch_cache_messager_tprank{i}.log 2>&1" ) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index b0b65912848..78dc5899bf6 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1266,11 +1266,14 @@ def __init__(self, args): self.model_cfg = None self.enable_chunked_prefill = False self.rdma_comm_ports = None + self.local_rdma_comm_ports = None self.cache_transfer_protocol = None self.pd_comm_port = None + self.local_pd_comm_port = None self.enable_prefix_caching = False self.enable_ssd_cache = False self.cache_queue_port = None + self.local_cache_queue_port = None self.swap_space = None self.max_encoder_cache = None self.max_processor_cache = None @@ -1761,6 +1764,9 @@ def postprocess(self): else: raise NotImplementedError + self.postprocess_devices_and_ports() + + def postprocess_devices_and_ports(self): # get devices and ports for current dp self.local_device_ids = self.parallel_config.device_ids.split(",")[ self.parallel_config.local_data_parallel_id @@ -1770,17 +1776,17 @@ def postprocess(self): self.parallel_config.local_engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[ self.parallel_config.local_data_parallel_id ] - self.cache_config.cache_queue_port = ( + self.cache_config.local_cache_queue_port = ( self.cache_config.cache_queue_port[self.parallel_config.local_data_parallel_id] if self.cache_config.cache_queue_port else None ) - self.cache_config.pd_comm_port = ( + self.cache_config.local_pd_comm_port = ( self.cache_config.pd_comm_port[self.parallel_config.local_data_parallel_id] if self.cache_config.pd_comm_port else None ) - self.cache_config.rdma_comm_ports = ( + self.cache_config.local_rdma_comm_ports = ( self.cache_config.rdma_comm_ports[ self.parallel_config.local_data_parallel_id * self.parallel_config.tensor_parallel_size : (self.parallel_config.local_data_parallel_id + 1) @@ -1947,8 +1953,8 @@ def init_cache_info(self): "role": self.scheduler_config.splitwise_role, "host_ip": self.host_ip, "port": port, - "connector_port": self.cache_config.pd_comm_port, - "rdma_ports": self.cache_config.rdma_comm_ports, + "connector_port": self.cache_config.local_pd_comm_port, + "rdma_ports": self.cache_config.local_rdma_comm_ports, 
"engine_worker_queue_port": self.parallel_config.local_engine_worker_queue_port, "device_ids": self.local_device_ids, "transfer_protocol": transfer_protocol, diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 367fb4d88d4..33d9abe6077 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -115,7 +115,7 @@ def __init__(self, cfg, start_queue=True, use_async_llm=False): self.start_worker_queue_service(start_queue) os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.cfg.parallel_config.local_engine_worker_queue_port) - llm_logger.info(f"INFERENCE_MSG_QUEUE_ID: {str(self.cfg.parallel_config.local_engine_worker_queue_port)}") + self.llm_logger.info(f"INFERENCE_MSG_QUEUE_ID: {str(self.cfg.parallel_config.local_engine_worker_queue_port)}") self.split_connector = SplitwiseConnector(cfg, self.engine_worker_queue, self.resource_manager) self.token_processor = TokenProcessor( @@ -202,7 +202,7 @@ def start_worker_service(self, async_llm_pid=None): def check_worker_initialize_status_func(res: dict): res["worker_is_alive"] = True if not self.check_worker_initialize_status(): - llm_logger.error("Failed to launch worker processes, check log/workerlog.* for more details.") + self.llm_logger.error("Failed to launch worker processes, check log/workerlog.* for more details.") res["worker_is_alive"] = False self.check_worker_initialize_status_func_thread = threading.Thread( @@ -232,7 +232,7 @@ def check_worker_initialize_status_func(res: dict): # Worker launched self.check_worker_initialize_status_func_thread.join() if not result_container["worker_is_alive"]: - llm_logger.error("Failed to launch worker processes, check log/workerlog.* for more details.") + self.llm_logger.error("Failed to launch worker processes, check log/workerlog.* for more details.") return False # Start ZMQ service for communication with AsyncLLM @@ -347,37 +347,38 @@ def start_worker_queue_service(self, start_queue): else: address = f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.local_engine_worker_queue_port}.sock" - if start_queue and (self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"): - self.llm_logger.info(f"Starting engine worker queue server service at {address}") - self.engine_worker_queue_server = EngineWorkerQueue( - address=address, - is_server=True, - num_client=self.cfg.parallel_config.tensor_parallel_size, - local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, - ) - # Dynamically updates the port value if an anonymous port is used - if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM: - self.cfg.parallel_config.local_engine_worker_queue_port = ( - self.engine_worker_queue_server.get_server_port() - ) - address = ( - self.cfg.master_ip, - self.cfg.parallel_config.local_engine_worker_queue_port, + if self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0": + if start_queue: + self.llm_logger.info(f"Starting engine worker queue server service at {address}") + self.engine_worker_queue_server = EngineWorkerQueue( + address=address, + is_server=True, + num_client=self.cfg.parallel_config.tensor_parallel_size, + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, ) + # Dynamically updates the port value if an anonymous port is used + if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM: + self.cfg.parallel_config.local_engine_worker_queue_port = ( + self.engine_worker_queue_server.get_server_port() + ) + address = ( + self.cfg.master_ip, + self.cfg.parallel_config.local_engine_worker_queue_port, + ) 
if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed": + self.llm_logger.info( + f"Starting engine cache queue server service at {self.cfg.cache_config.local_cache_queue_port}" + ) self.cache_task_queue = EngineCacheQueue( - address=( - self.cfg.master_ip, - self.cfg.cache_config.cache_queue_port, - ), + address=(self.cfg.master_ip, self.cfg.cache_config.local_cache_queue_port), authkey=b"cache_queue_service", is_server=True, num_client=self.cfg.parallel_config.tensor_parallel_size, client_id=-1, local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, ) - self.cfg.cache_config.cache_queue_port = self.cache_task_queue.get_server_port() + self.cfg.cache_config.local_cache_queue_port = self.cache_task_queue.get_server_port() self.engine_worker_queue = EngineWorkerQueue( address=address, @@ -720,7 +721,7 @@ def _schedule_request_to_worker(self): # so the same request sent by the decode api server will be ignored continue - llm_logger.debug(f"get tasks from scheduler: {tasks}") + self.llm_logger.debug(f"get tasks from scheduler: {tasks}") if self.cfg.scheduler_config.splitwise_role != "mixed": for task in tasks: task.metrics.ask_decode_resource_start_time = time.time() @@ -931,7 +932,7 @@ def _fetch_request(): get_request_pool.submit(_fetch_request) except RuntimeError as e: if "shutdown" in str(e): - llm_logger.info("Thread pool shutdown detected, exiting scheduler loop") + self.llm_logger.info("Thread pool shutdown detected, exiting scheduler loop") break else: raise @@ -977,7 +978,7 @@ def _fetch_request(): if error_tasks: for request_id, failed in error_tasks: if failed is None: - llm_logger.warning(f"Request {request_id} has no error, skip sending error response.") + self.llm_logger.warning(f"Request {request_id} has no error, skip sending error response.") continue self._send_error_response(request_id, failed) @@ -1091,7 +1092,7 @@ def _insert_zmq_task_to_scheduler(self): ) def _send_error_response(self, request_id, error_msg, error_code: int = 500): - llm_logger.error( + self.llm_logger.error( f"Send error response to client, request_id: {request_id}, error_msg: {error_msg}, error_code: {error_code}" ) error_result = RequestOutput( @@ -1154,7 +1155,7 @@ def _zmq_send_generated_tokens(self): elif content.finished: new_step_contents.append(content) else: - llm_logger.warning( + self.llm_logger.warning( f"current tokens need to accumulate, req_id: {content.request_id} {content.outputs.token_ids}" ) else: @@ -1184,16 +1185,16 @@ def _zmq_send_generated_tokens(self): elif content.finished: new_contents.append(content) else: - llm_logger.warning( + self.llm_logger.warning( f"current tokens need to accumulate, req_id: {request_id} {content.outputs.token_ids}" ) else: new_contents.append(content) if len(new_contents): - llm_logger.debug(f"Send response for request id: {request_id}") + self.llm_logger.debug(f"Send response for request id: {request_id}") self.send_response_server.send_response(request_id, new_contents) except Exception as e: - llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}") + self.llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}") def _decode_process_splitwise_requests(self): """ @@ -1342,15 +1343,15 @@ def check_and_free_block_tables(self): def clear_data(self): try: - llm_logger.info("Clear Data: Start") + self.llm_logger.info("Clear Data: Start") self.token_processor.clear_data() self.engine_worker_queue.clear_data() self.send_response_server.req_dict.clear() 
self.recv_request_server.req_dict.clear() - llm_logger.info("Clear Data: Successfully") + self.llm_logger.info("Clear Data: Successfully") return True except Exception as e: - llm_logger.error(f"Clear data error: {e}") + self.llm_logger.error(f"Clear data error: {e}") return False def _register_to_router(self): @@ -1376,18 +1377,18 @@ def _register(): ) if resp.ok: - llm_logger.info("Successfully registered to the router!") + self.llm_logger.info("Successfully registered to the router!") break else: - llm_logger.error( + self.llm_logger.error( f"Router registration failed: {resp.status_code}, " f"{resp.text}, {self.cfg.register_info}" ) time.sleep(sleep_seconds) except requests.exceptions.RequestException as e: - llm_logger.error(f"Register to router request error: {e}") + self.llm_logger.error(f"Register to router request error: {e}") except Exception as e: - llm_logger.exception(f"Unexpected error during router registration: {e}") + self.llm_logger.exception(f"Unexpected error during router registration: {e}") if self.cfg.router_config.router is not None: register_thread = threading.Thread(target=_register, daemon=True) @@ -1397,45 +1398,45 @@ def _exit_sub_services(self): """ exit sub services """ - llm_logger.info("Exit sub services.....") + self.llm_logger.info("Exit sub services.....") self.running = False if self.use_async_llm: # Clean up worker processes first (before closing multiprocessing services) if hasattr(self, "worker_proc") and self.worker_proc is not None: - llm_logger.info("Cleaning up worker processes...") + self.llm_logger.info("Cleaning up worker processes...") try: pgid = os.getpgid(self.worker_proc.pid) os.killpg(pgid, signal.SIGTERM) except Exception as e: - llm_logger.error(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") + self.llm_logger.error(f"Error extracting sub services: {e}, {str(traceback.format_exc())}") # Clean up cache manager processes if hasattr(self, "cache_manager_processes"): - llm_logger.info("Cleaning up cache manager processes...") + self.llm_logger.info("Cleaning up cache manager processes...") self.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear() self.resource_manager.cache_manager.cache_ready_signal.clear() for p in self.cache_manager_processes: - llm_logger.info(f"Killing cache manager process {p.pid}") + self.llm_logger.info(f"Killing cache manager process {p.pid}") try: pgid = os.getpgid(p.pid) os.killpg(pgid, signal.SIGTERM) except Exception as e: - llm_logger.error( + self.llm_logger.error( f"Error killing cache manager process {p.pid}: {e}, {str(traceback.format_exc())}" ) if hasattr(self, "cache_task_queue") and self.cache_task_queue is not None: - llm_logger.info("Cleaning up cache_task_queue...") + self.llm_logger.info("Cleaning up cache_task_queue...") # Check if cleanup method exists if hasattr(self.cache_task_queue, "cleanup"): self.cache_task_queue.cleanup() elif hasattr(self.cache_task_queue, "manager"): try: - llm_logger.info("Shutting down cache_task_queue manager...") + self.llm_logger.info("Shutting down cache_task_queue manager...") self.cache_task_queue.manager.shutdown() except Exception as e: - llm_logger.warning(f"Error shutting down cache_task_queue manager: {e}") + self.llm_logger.warning(f"Error shutting down cache_task_queue manager: {e}") if hasattr(self, "get_profile_block_num_signal"): self.get_profile_block_num_signal.clear() @@ -1446,7 +1447,7 @@ def _exit_sub_services(self): # Clean up other services if hasattr(self, "dp_processed"): for p in self.dp_processed: - 
llm_logger.info(f"Waiting for worker {p.pid} to exit") + self.llm_logger.info(f"Waiting for worker {p.pid} to exit") p.join() for p in self.dp_engine_worker_queue_server: p.cleanup() @@ -1614,9 +1615,9 @@ def _start_worker_service(self): think_end_id = self.data_processor.tokenizer.get_vocab().get("", -1) if think_end_id > 0: - llm_logger.info(f"Get think_end_id {think_end_id} from vocab.") + self.llm_logger.info(f"Get think_end_id {think_end_id} from vocab.") else: - llm_logger.info("No token found in vocabulary, the model can not do reasoning.") + self.llm_logger.info("No token found in vocabulary, the model can not do reasoning.") image_patch_id = self.data_processor.tokenizer.get_vocab().get("<|IMAGE_PLACEHOLDER|>", -1) line_break_id = self.data_processor.tokenizer.get_vocab().get("\n", -1) @@ -1696,7 +1697,7 @@ def _start_worker_service(self): if self.cfg.nnode > 1: pd_cmd = pd_cmd + f" --ips {ips} --nnodes {len(self.cfg.ips)}" pd_cmd = pd_cmd + arguments + f" 2>{log_dir}/launch_worker.log" - llm_logger.info(f"Launch worker service command: {pd_cmd}") + self.llm_logger.info(f"Launch worker service command: {pd_cmd}") p = subprocess.Popen( pd_cmd, stdout=subprocess.PIPE, @@ -1772,7 +1773,7 @@ def launch_components(self): else: address = f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.engine_worker_queue_port[i]}.sock" - llm_logger.info(f"dp start queue service {address}") + self.llm_logger.info(f"dp start queue service {address}") self.dp_engine_worker_queue_server.append( EngineWorkerQueue( address=address, @@ -1794,7 +1795,7 @@ def launch_components(self): ), ) ) - llm_logger.info( + self.llm_logger.info( f"Engine is initialized successfully with {self.cfg.parallel_config.tensor_parallel_size}" + f" data parallel id {i}" ) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index a783b2ab791..00868bb224e 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -16,6 +16,7 @@ from __future__ import annotations +import copy import json import multiprocessing import os @@ -365,7 +366,7 @@ def _init_worker_signals(self): ) # launched_expert_service_signal: Used to sense whether each expet_servic is started successfully - if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1: + if self.cfg.parallel_config.data_parallel_size > 1 and not envs.FD_ENABLE_MULTI_API_SERVER: launched_expert_service_signal_data = np.zeros( shape=[self.cfg.parallel_config.data_parallel_size // self.cfg.nnode], dtype=np.int32 ) @@ -743,12 +744,10 @@ def launch_components(self): if not envs.FD_ENGINE_TASK_QUEUE_WITH_SHM: address = ( self.cfg.master_ip, - int(self.cfg.parallel_config.local_engine_worker_queue_port), + int(self.cfg.parallel_config.engine_worker_queue_port[i]), ) else: - address = ( - f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.local_engine_worker_queue_port}.sock" - ) + address = f"/dev/shm/fd_task_queue_{self.cfg.parallel_config.engine_worker_queue_port[i]}.sock" llm_logger.info(f"dp start queue service {address}") self.dp_engine_worker_queue_server.append( @@ -759,12 +758,13 @@ def launch_components(self): local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, ) ) - ctx = multiprocessing.get_context("spawn") + ctx = multiprocessing.get_context("fork") + cfg = copy.deepcopy(self.cfg) self.dp_processed.append( ctx.Process( target=start_data_parallel_service, args=( - self.cfg, + cfg, i, None, request_queues_for_dp_ipc, diff --git a/fastdeploy/engine/expert_service.py 
b/fastdeploy/engine/expert_service.py index 5eceb487445..cab831ab983 100644 --- a/fastdeploy/engine/expert_service.py +++ b/fastdeploy/engine/expert_service.py @@ -27,7 +27,7 @@ from fastdeploy.engine.common_engine import EngineService from fastdeploy.inter_communicator import IPCSignal -from fastdeploy.utils import console_logger, envs, llm_logger +from fastdeploy.utils import console_logger, envs, get_logger, llm_logger class ExpertService: @@ -48,6 +48,12 @@ def __init__(self, cfg, local_data_parallel_id, start_queue=True): """ self.cfg = cfg + + if self.cfg.parallel_config.data_parallel_size > 1: + self.llm_logger = get_logger("fastdeploy", f"fastdeploy_dprank{local_data_parallel_id}.log") + else: + self.llm_logger = llm_logger + if cfg.scheduler_config.splitwise_role != "mixed": if envs.FD_ENABLE_INTERNAL_ADAPTER: envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT = envs.FD_ZMQ_RECV_REQUEST_SERVER_PORTS.split(",")[ @@ -56,14 +62,25 @@ def __init__(self, cfg, local_data_parallel_id, start_queue=True): envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT = envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORTS.split(",")[ local_data_parallel_id ] - llm_logger.info(f"local_data_parallel_id: {local_data_parallel_id}") + self.llm_logger.info(f"local_data_parallel_id: {local_data_parallel_id}") if self.cfg.cache_config.num_gpu_blocks_override is None: self.do_profile = True else: self.do_profile = False - self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id + # Update config for the current dp process + if not envs.FD_ENABLE_MULTI_API_SERVER: + self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id + self.cfg.postprocess_devices_and_ports() + self.llm_logger.info( + f"Update config for the current dp process: " + f"local_engine_worker_queue_port: {self.cfg.parallel_config.local_engine_worker_queue_port} " + f"local_cache_queue_port: {self.cfg.cache_config.local_cache_queue_port} " + f"local_pd_comm_port: {self.cfg.cache_config.local_pd_comm_port} " + f"local_rdma_comm_ports: {self.cfg.cache_config.local_rdma_comm_ports} " + ) + self.engine = EngineService(self.cfg, start_queue) if self.cfg.scheduler_config.name == "splitwise": self.engine.scheduler.reset_nodeid(f"{self.engine.scheduler.infer.nodeid}_{local_data_parallel_id!s}") @@ -96,7 +113,7 @@ def start( ipc_signal_suffix = self.cfg.parallel_config.engine_worker_queue_port[0] self.engine.start_zmq_service(self.cfg.parallel_config.engine_worker_queue_port[local_data_parallel_id]) - llm_logger.info(f"start expert service {local_data_parallel_id}") + self.llm_logger.info(f"start expert service {local_data_parallel_id}") if self.cfg.scheduler_config.name == "splitwise": self.cfg.init_cache_info() @@ -114,7 +131,7 @@ def start( local_rank = local_data_parallel_id % self.cfg.worker_num_per_node if not envs.FD_ENABLE_MULTI_API_SERVER: - if self.cfg.parallel_config.enable_expert_parallel: + if self.cfg.parallel_config.data_parallel_size > 1: launched_expert_service_signal_data = np.zeros( shape=[self.cfg.parallel_config.data_parallel_size // self.cfg.nnode], dtype=np.int32 ) @@ -126,6 +143,7 @@ def start( create=False, ) self.launched_expert_service_signal.value[local_rank] = 1 + if self.do_profile: get_profile_block_num = np.zeros([1], dtype=np.int32) while True: @@ -143,10 +161,9 @@ def start( self.reset_kvcache_blocks() if self.cfg.scheduler_config.splitwise_role != "mixed" or self.cfg.cache_config.enable_prefix_caching: - ipc_signal_suffix_cache = self.cfg.parallel_config.engine_worker_queue_port[local_data_parallel_id] self.cache_manager_processes 
= self.engine.start_cache_service( self.cfg.local_device_ids, - ipc_signal_suffix_cache, + self.cfg.parallel_config.local_engine_worker_queue_port, ) console_logger.info( f"Worker processes(rank {local_rank}) are launched with {time.time() - start_time} seconds." @@ -169,7 +186,7 @@ def _exit_sub_services(self): if hasattr(self, "cache_manager_processes"): self.engine.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear() for p in self.cache_manager_processes: - llm_logger.info(f"Killing cache manager process {p.pid}") + self.llm_logger.info(f"Killing cache manager process {p.pid}") try: os.killpg(p.pid, signal.SIGTERM) except: diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index 8d81c193090..36879e02b81 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -71,8 +71,8 @@ def _init_network(self): self.router_socket.setsockopt(zmq.LINGER, 0) self.router_socket.setsockopt(zmq.SNDHWM, 1000) self.router_socket.setsockopt(zmq.ROUTER_MANDATORY, 1) - self.router_socket.bind(f"tcp://*:{self.cfg.cache_config.pd_comm_port}") - self.logger.info(f"_init_network: bind {self.cfg.cache_config.pd_comm_port}") + self.logger.info(f"_init_network: bind {self.cfg.cache_config.local_pd_comm_port}") + self.router_socket.bind(f"tcp://*:{self.cfg.cache_config.local_pd_comm_port}") self.poller = zmq.Poller() self.poller.register(self.router_socket, zmq.POLLIN) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8bfdd70c8e4..43a79064723 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1654,6 +1654,7 @@ def initialize_kv_cache(self, profile: bool = False) -> None: logger.info(f"✅ kv cache is ready! 
{cache_ready_signal.value}") paddle.device.cuda.empty_cache() + logger.info("kv cache is initialized!") def _initialize_attn_backend(self) -> None: """ From b3fec44d8223e375b4a0aa90ad76b02513a23159 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Wed, 10 Dec 2025 17:15:05 +0800 Subject: [PATCH 07/17] [fix] fix port for xpu --- fastdeploy/worker/xpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index f9bbb4ea95d..e0d93551abd 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -1010,7 +1010,7 @@ def initialize_kv_cache(self, profile: bool = False) -> None: name="cache_ready_signal", array=cache_ready_signal_data, dtype=np.int32, - suffix=self.parallel_config.engine_worker_queue_port, + suffix=self.parallel_config.local_engine_worker_queue_port, create=False, ) From 8ceb83cea089abb8e916849cb97fa7c82d939c06 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Wed, 10 Dec 2025 21:04:08 +0800 Subject: [PATCH 08/17] [fix] add tests for ports post processing & fix ci --- fastdeploy/config.py | 51 ++++++++++++++++++----------------- tests/utils/test_config.py | 54 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 24 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 78dc5899bf6..00cbfeab883 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1767,34 +1767,37 @@ def postprocess(self): self.postprocess_devices_and_ports() def postprocess_devices_and_ports(self): - # get devices and ports for current dp - self.local_device_ids = self.parallel_config.device_ids.split(",")[ - self.parallel_config.local_data_parallel_id - * self.parallel_config.tensor_parallel_size : (self.parallel_config.local_data_parallel_id + 1) - * self.parallel_config.tensor_parallel_size - ] - self.parallel_config.local_engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[ - self.parallel_config.local_data_parallel_id - ] - self.cache_config.local_cache_queue_port = ( - self.cache_config.cache_queue_port[self.parallel_config.local_data_parallel_id] - if self.cache_config.cache_queue_port - else None - ) - self.cache_config.local_pd_comm_port = ( - self.cache_config.pd_comm_port[self.parallel_config.local_data_parallel_id] - if self.cache_config.pd_comm_port - else None - ) - self.cache_config.local_rdma_comm_ports = ( - self.cache_config.rdma_comm_ports[ + try: + # get devices and ports for current dp + self.local_device_ids = self.parallel_config.device_ids.split(",")[ self.parallel_config.local_data_parallel_id * self.parallel_config.tensor_parallel_size : (self.parallel_config.local_data_parallel_id + 1) * self.parallel_config.tensor_parallel_size ] - if self.cache_config.rdma_comm_ports - else None - ) + self.parallel_config.local_engine_worker_queue_port = self.parallel_config.engine_worker_queue_port[ + self.parallel_config.local_data_parallel_id + ] + self.cache_config.local_cache_queue_port = ( + self.cache_config.cache_queue_port[self.parallel_config.local_data_parallel_id] + if self.cache_config.cache_queue_port + else None + ) + self.cache_config.local_pd_comm_port = ( + self.cache_config.pd_comm_port[self.parallel_config.local_data_parallel_id] + if self.cache_config.pd_comm_port + else None + ) + self.cache_config.local_rdma_comm_ports = ( + self.cache_config.rdma_comm_ports[ + self.parallel_config.local_data_parallel_id + * self.parallel_config.tensor_parallel_size : 
(self.parallel_config.local_data_parallel_id + 1) + * self.parallel_config.tensor_parallel_size + ] + if self.cache_config.rdma_comm_ports + else None + ) + except Exception as e: + logger.error(f"Failed to extract local devices or ports. Servers may not be able to start properly. {e}") def check(self): """ diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 7638e465b75..326f1251faa 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -14,6 +14,7 @@ # limitations under the License. """ +import random import unittest from unittest.mock import Mock @@ -132,6 +133,59 @@ def test_fdconfig_init_cache(self): fd_config.init_cache_info() assert fd_config.register_info is not None + def test_fdconfig_postprocess_ports(self): + data_parallel_size = 4 + tensor_parallel_size = 2 + local_data_parallel_id = random.randint(0, data_parallel_size - 1) + engine_worker_queue_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)] + cache_queue_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)] + pd_comm_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size)] + rdma_comm_ports = [random.randint(8000, 65535) for _ in range(data_parallel_size * tensor_parallel_size)] + + parallel_config = ParallelConfig( + { + "engine_worker_queue_port": ",".join(map(str, engine_worker_queue_ports)), + "data_parallel_size": data_parallel_size, + "tensor_parallel_size": tensor_parallel_size, + "local_data_parallel_id": local_data_parallel_id, + } + ) + graph_opt_config = GraphOptimizationConfig({}) + cache_config = CacheConfig( + { + "cache_queue_port": ",".join(map(str, cache_queue_ports)), + "pd_comm_port": ",".join(map(str, pd_comm_ports)), + "rdma_comm_ports": ",".join(map(str, rdma_comm_ports)), + } + ) + load_config = LoadConfig({}) + scheduler_config = SchedulerConfig({}) + model_config: Mock = Mock() + model_config.max_model_len = 512 + + fd_config = FDConfig( + parallel_config=parallel_config, + graph_opt_config=graph_opt_config, + cache_config=cache_config, + load_config=load_config, + scheduler_config=scheduler_config, + model_config=model_config, + ips="0.0.0.0", + test_mode=True, + ) + assert ( + fd_config.parallel_config.local_engine_worker_queue_port + == engine_worker_queue_ports[local_data_parallel_id] + ) + assert fd_config.cache_config.local_cache_queue_port == cache_queue_ports[local_data_parallel_id] + assert fd_config.cache_config.local_pd_comm_port == pd_comm_ports[local_data_parallel_id] + assert ( + fd_config.cache_config.local_rdma_comm_ports + == rdma_comm_ports[ + local_data_parallel_id * tensor_parallel_size : (local_data_parallel_id + 1) * tensor_parallel_size + ] + ) + if __name__ == "__main__": unittest.main() From c2bc5d6ae92fac03dfd43b2f25812aeab95c476e Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Thu, 11 Dec 2025 12:29:38 +0800 Subject: [PATCH 09/17] [test] fix test_multi_api_server --- .../entrypoints/openai/multi_api_server.py | 37 ++++----- .../openai/test_multi_api_server.py | 78 ++++++++++++------- 2 files changed, 68 insertions(+), 47 deletions(-) diff --git a/fastdeploy/entrypoints/openai/multi_api_server.py b/fastdeploy/entrypoints/openai/multi_api_server.py index 40e268e9d2a..54cff31faff 100644 --- a/fastdeploy/entrypoints/openai/multi_api_server.py +++ b/fastdeploy/entrypoints/openai/multi_api_server.py @@ -34,7 +34,22 @@ def start_servers( metrics_ports=None, controller_ports=None, ): - processes = [] + ports = ports.split(",") + if not check_param(ports, server_count): + 
return + + if metrics_ports != "-1": + metrics_ports = metrics_ports.split(",") + if not check_param(metrics_ports, server_count): + return + + if controller_ports != "-1": + controller_ports = controller_ports.split(",") + if not check_param(controller_ports, server_count): + return + else: + controller_ports = [-1] * server_count + logger.info(f"Starting servers on ports: {ports} with args: {server_args} and metrics ports: {metrics_ports}") port_idx = {} for i in range(len(server_args)): @@ -83,22 +98,8 @@ def start_servers( if not check_param(rdma_comm_port, device_count): return - if not check_param(ports, server_count): - return - - if metrics_ports != "-1": - metrics_ports = metrics_ports.split(",") - if not check_param(metrics_ports, server_count): - return - - if controller_ports != "-1": - controller_ports = controller_ports.split(",") - if not check_param(controller_ports, server_count): - return - else: - controller_ports = [-1] * server_count - logger.info(f"Modified server_args: {server_args}") + processes = [] for i in range(server_count): port = int(ports[i]) controller_port = int(controller_ports[i]) @@ -118,7 +119,7 @@ def start_servers( str(i), ] if metrics_ports != "-1": - cmd += [metrics_ports[i], "--controller-port"] + cmd += ["--metrics-port", metrics_ports[i]] # 启动子进程 proc = subprocess.Popen(cmd, env=env) @@ -159,7 +160,7 @@ def main(): server_count=args.num_servers, device_count=device_count, server_args=args.args, - ports=args.ports.split(","), + ports=args.ports, metrics_ports=args.metrics_ports, controller_ports=args.controller_ports, ) diff --git a/tests/entrypoints/openai/test_multi_api_server.py b/tests/entrypoints/openai/test_multi_api_server.py index f1623d26add..a17f306acda 100644 --- a/tests/entrypoints/openai/test_multi_api_server.py +++ b/tests/entrypoints/openai/test_multi_api_server.py @@ -14,7 +14,8 @@ # limitations under the License. 
""" -import sys +import os +import random import unittest from unittest.mock import MagicMock, patch @@ -30,10 +31,26 @@ class TestMultiApiServer(unittest.TestCase): def setUp(self): """Set up test fixtures""" - self.test_ports = ["8000", "8001"] - self.test_metrics_ports = ["8800", "8801"] - self.test_server_args = ["--model", "test_model", "--engine-worker-queue-port", "9000,9001"] + self.test_model = "test_model" + self.test_ports = "8000,8001" + self.test_metrics_ports = "8800,8801" + self.test_engine_worker_queue_port = "9000,9001" + self.test_server_args = [ + "--model", + self.test_model, + "--engine-worker-queue-port", + self.test_engine_worker_queue_port, + ] self.test_server_count = 2 + self.test_device_count = 2 + + patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0,1"}).start() + patch( + "fastdeploy.entrypoints.openai.multi_api_server.find_free_ports", + side_effect=lambda *args, **kwargs: [ + random.randint(8000, 65535) for i in range(kwargs.get("num_ports", 1)) + ], + ).start() @patch("fastdeploy.entrypoints.openai.multi_api_server.subprocess.Popen") @patch("fastdeploy.entrypoints.openai.multi_api_server.is_port_available") @@ -49,6 +66,7 @@ def test_start_servers_success(self, mock_is_port_available, mock_popen): # Call start_servers processes = start_servers( server_count=self.test_server_count, + device_count=self.test_device_count, server_args=self.test_server_args, ports=self.test_ports, metrics_ports=self.test_metrics_ports, @@ -63,24 +81,20 @@ def test_start_servers_success(self, mock_is_port_available, mock_popen): # Verify the command arguments for the first server first_call_args = mock_popen.call_args_list[0][0][0] - expected_cmd = [ - sys.executable, - "-m", - "fastdeploy.entrypoints.openai.api_server", - "--model", - "test_model", - "--engine-worker-queue-port", - "9000,9001", - "--port", - "8000", - "--metrics-port", - "8800", - "--controller-port", - "-1", - "--local-data-parallel-id", - "0", - ] - self.assertEqual(first_call_args, expected_cmd) + print(first_call_args) + for i, item in enumerate(first_call_args): + if item == "--port": + self.assertEqual(first_call_args[i + 1], self.test_ports.split(",")[0]) + if item == "--metrics-port": + self.assertEqual(first_call_args[i + 1], self.test_metrics_ports.split(",")[0]) + if item == "--controller-port": + self.assertEqual(first_call_args[i + 1], "-1") + if item == "--model": + self.assertEqual(first_call_args[i + 1], self.test_model) + if item == "--engine-worker-queue-port": + self.assertEqual(first_call_args[i + 1], self.test_engine_worker_queue_port) + if item == "--local-data-parallel-id": + self.assertEqual(first_call_args[i + 1], "0") # Verify environment variables are set correctly first_call_kwargs = mock_popen.call_args_list[0][1] @@ -94,7 +108,7 @@ def test_check_param_success(self, mock_is_port_available): mock_is_port_available.return_value = True # Should not raise any exception - check_param(self.test_ports, self.test_server_count) + check_param(self.test_ports.split(","), self.test_server_count) def test_check_param_wrong_port_count(self): """Test parameter validation with wrong port count""" @@ -108,12 +122,13 @@ def test_check_param_port_in_use(self, mock_is_port_available): # Mock port availability check - first port available, second not mock_is_port_available.side_effect = [True, False] - self.assertFalse(check_param(self.test_ports, self.test_server_count)) + self.assertFalse(check_param(self.test_ports.split(","), self.test_server_count)) + 
@patch("fastdeploy.entrypoints.openai.multi_api_server.is_port_available") @patch("fastdeploy.entrypoints.openai.multi_api_server.start_servers") @patch("fastdeploy.entrypoints.openai.multi_api_server.time.sleep") @patch("fastdeploy.entrypoints.openai.multi_api_server.check_param") - def test_main_function(self, mock_check_param, mock_sleep, mock_start_servers): + def test_main_function(self, mock_check_param, mock_sleep, mock_start_servers, mock_is_port_available): """Test main function with mocked arguments""" # Mock command line arguments test_args = [ @@ -133,6 +148,9 @@ def test_main_function(self, mock_check_param, mock_sleep, mock_start_servers): "9000,9001", ] + # Mock utilization functions + mock_is_port_available.return_value = True + # Mock processes mock_proc1 = MagicMock() mock_proc2 = MagicMock() @@ -144,12 +162,14 @@ def test_main_function(self, mock_check_param, mock_sleep, mock_start_servers): with patch("sys.argv", test_args): main() + print(mock_start_servers) # Verify start_servers was called with correct parameters mock_start_servers.assert_called_once_with( - server_count=2, - server_args=["--model", "test_model", "--engine-worker-queue-port", "9000,9001"], - ports=["8000", "8001"], - metrics_ports=["8800", "8801"], + server_count=self.test_server_count, + device_count=self.test_device_count, + server_args=self.test_server_args, + ports=self.test_ports, + metrics_ports=self.test_metrics_ports, controller_ports="8802,8803", ) From 32fa9e8aa90ce4eeb502687d29cf704cae82b4a9 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Thu, 11 Dec 2025 12:33:34 +0800 Subject: [PATCH 10/17] [fix] fix rdma_comm_ports args for multi_api_server --- .../entrypoints/openai/multi_api_server.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fastdeploy/entrypoints/openai/multi_api_server.py b/fastdeploy/entrypoints/openai/multi_api_server.py index 54cff31faff..20f3ffbea94 100644 --- a/fastdeploy/entrypoints/openai/multi_api_server.py +++ b/fastdeploy/entrypoints/openai/multi_api_server.py @@ -59,8 +59,8 @@ def start_servers( port_idx["cache_queue_port"] = i + 1 if server_args[i] == "--pd-comm-port": port_idx["pd_comm_port"] = i + 1 - if server_args[i] == "--rdma-comm-port": - port_idx["rdma_comm_port"] = i + 1 + if server_args[i] == "--rdma-comm-ports": + port_idx["rdma_comm_ports"] = i + 1 if "engine_worker_queue_port" not in port_idx: port = find_free_ports(num_ports=server_count) @@ -89,13 +89,13 @@ def start_servers( if not check_param(pd_comm_port, server_count): return - if "rdma_comm_port" not in port_idx: + if "rdma_comm_ports" not in port_idx: port = find_free_ports(num_ports=device_count) - server_args += ["--rdma-comm-port", ",".join(map(str, port))] - port_idx["rdma_comm_port"] = len(server_args) - 1 - logger.info(f"No --rdma-comm-port specified, using random ports: {port}") - rdma_comm_port = server_args[port_idx["rdma_comm_port"]].split(",") - if not check_param(rdma_comm_port, device_count): + server_args += ["--rdma-comm-ports", ",".join(map(str, port))] + port_idx["rdma_comm_ports"] = len(server_args) - 1 + logger.info(f"No --rdma-comm-ports specified, using random ports: {port}") + rdma_comm_ports = server_args[port_idx["rdma_comm_ports"]].split(",") + if not check_param(rdma_comm_ports, device_count): return logger.info(f"Modified server_args: {server_args}") From d1ab65cfd9b8e509d0dffc356531541261e809b4 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Thu, 11 Dec 2025 16:00:13 +0800 Subject: [PATCH 11/17] [fix] fix test_common_engine 
--- fastdeploy/config.py | 2 +- fastdeploy/engine/common_engine.py | 2 +- fastdeploy/engine/engine.py | 1 + tests/engine/test_common_engine.py | 27 +++++++++++++++++---------- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 00cbfeab883..edf86389d12 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1650,7 +1650,7 @@ def __init__( if test_mode: return self.check() - self.print() + # self.print() # NOTE: it's better to explicitly call .print() when FDConfig is initialized def _disable_sequence_parallel_moe_if_needed(self, mode_name): if self.parallel_config.use_sequence_parallel_moe and self.graph_opt_config.use_cudagraph: diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 33d9abe6077..9aa657f23a0 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -1621,7 +1621,7 @@ def _start_worker_service(self): image_patch_id = self.data_processor.tokenizer.get_vocab().get("<|IMAGE_PLACEHOLDER|>", -1) line_break_id = self.data_processor.tokenizer.get_vocab().get("\n", -1) - ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port) + ports = ",".join(map(str, self.cfg.parallel_config.engine_worker_queue_port)) ips = None if self.cfg.ips is not None: ips = ",".join(self.cfg.ips) diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 00868bb224e..7c5e8a3223a 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -85,6 +85,7 @@ def __init__(self, cfg): cfg (Config): Config object containing all the configuration parameters. """ self.cfg = cfg + self.cfg.print() self.running = True self.is_started = False diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py index a3ed0270e32..673632ee0d0 100644 --- a/tests/engine/test_common_engine.py +++ b/tests/engine/test_common_engine.py @@ -200,14 +200,26 @@ class TestCommonEngineAdditionalCoverage(unittest.TestCase): and to drive specific code paths that were previously uncovered. 
""" + def setUp(self): + patch("fastdeploy.engine.common_engine.EngineCacheQueue").start() + def _make_cfg(self, **kwargs): + # If DP > 1, we must provide enough engine_worker_queue_port for each dp index + dp = kwargs.get("data_parallel_size", 1) + nnode = len(kwargs.get("ips", ["127.0.0.1"])) + engine_worker_queue_port = int(os.getenv("FD_ENGINE_QUEUE_PORT", "6778")) + cache_queue_port = int(os.getenv("FD_CACHE_QUEUE_PORT", "6779")) + if dp and dp > 1: + engine_worker_queue_port = [engine_worker_queue_port + 20 + i for i in range(dp // nnode)] + cache_queue_port = [cache_queue_port + 20 + i for i in range(dp // nnode)] + args = EngineArgs( model=MODEL_NAME, max_model_len=128, tensor_parallel_size=1, # give unique ports to avoid collision with other tests - engine_worker_queue_port=str(int(os.getenv("FD_ENGINE_QUEUE_PORT", "6778")) + 20), - cache_queue_port=str(int(os.getenv("FD_CACHE_QUEUE_PORT", "6779")) + 20), + engine_worker_queue_port=engine_worker_queue_port, + cache_queue_port=cache_queue_port, enable_prefix_caching=True, **kwargs, ) @@ -218,13 +230,6 @@ def _make_cfg(self, **kwargs): # Always enable chunked prefill in tests to avoid another strict check args.enable_chunked_prefill = True - # If DP > 1, we must provide enough engine_worker_queue_port for each dp index - dp = kwargs.get("data_parallel_size", args.data_parallel_size) - base = int(args.engine_worker_queue_port.split(",")[0]) - if dp and dp > 1: - ports = ",".join(str(base + i) for i in range(dp)) - args.engine_worker_queue_port = ports - return args.create_engine_config(port_availability_check=False) def _stub_processor(self): @@ -573,7 +578,9 @@ def __init__(self, *a, **k): def test_start_worker_service_cmd_build(self): """Cover 1517, 1526, 1568, 1592, 1595 by building the worker command with mocks.""" with patch("fastdeploy.config.get_host_ip", return_value="127.0.0.1"): - cfg = self._make_cfg(splitwise_role="mixed", num_gpu_blocks_override=4, ips=["127.0.0.1", "127.0.0.2"]) + cfg = self._make_cfg( + splitwise_role="mixed", num_gpu_blocks_override=4, ips=["127.0.0.1", "127.0.0.2"], data_parallel_size=2 + ) # Make model multi-modal so env var branch already covered above; here not required cfg.structured_outputs_config.logits_processors = ["A", "B"] From 1832afbd0ec2dabf383702ae944810c757a73195 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Thu, 11 Dec 2025 16:16:49 +0800 Subject: [PATCH 12/17] [fix] fix test_cache_transfer_manager --- tests/cache_manager/test_cache_transfer_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cache_manager/test_cache_transfer_manager.py b/tests/cache_manager/test_cache_transfer_manager.py index f09fc603325..7dff66d0bed 100644 --- a/tests/cache_manager/test_cache_transfer_manager.py +++ b/tests/cache_manager/test_cache_transfer_manager.py @@ -15,7 +15,7 @@ class Args: mp_num = 1 device_id = 0 speculative_config = {} - engine_pid = "test_pid" + ipc_suffix = "test_ipc_suffix" cache_queue_port = 9999 pod_ip = "127.0.0.1" engine_worker_queue_port = 9998 From d520838c8ad7e6a357548cc95d2bde75c428b8ca Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Thu, 11 Dec 2025 16:17:15 +0800 Subject: [PATCH 13/17] [chore] automatically setting FD_ENABLE_MULTI_API_SERVER --- fastdeploy/entrypoints/openai/multi_api_server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastdeploy/entrypoints/openai/multi_api_server.py b/fastdeploy/entrypoints/openai/multi_api_server.py index 20f3ffbea94..5e64228382c 100644 --- a/fastdeploy/entrypoints/openai/multi_api_server.py 
+++ b/fastdeploy/entrypoints/openai/multi_api_server.py @@ -105,6 +105,7 @@ def start_servers( controller_port = int(controller_ports[i]) env = os.environ.copy() + env["FD_ENABLE_MULTI_API_SERVER"] = "1" env["FD_LOG_DIR"] = env.get("FD_LOG_DIR", "log") + f"/log_{i}" cmd = [ sys.executable, From 624d02faa97771e270fc6c1c54a6aacd51fac5cb Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Fri, 12 Dec 2025 15:16:20 +0800 Subject: [PATCH 14/17] [fix] avoid api server from creating engine_args twice --- fastdeploy/engine/args_utils.py | 9 +++++---- fastdeploy/entrypoints/openai/api_server.py | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 7f5a3ca6de4..1426a8c27ac 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -43,11 +43,12 @@ ) from fastdeploy.platforms import current_platform from fastdeploy.scheduler.config import SchedulerConfig -from fastdeploy.utils import ( # is_port_available, +from fastdeploy.utils import ( DeprecatedOptionWarning, FlexibleArgumentParser, console_logger, find_free_ports, + is_port_available, parse_ports, parse_quantization, ) @@ -552,9 +553,9 @@ def post_init_ports(name: str, ports: list, num_total_ports: int): else: assert ( len(ports) == num_total_ports - ), f"Parameter `{name}` should have {num_total_ports} ports, got {len(ports)}." - # for port in ports: - # assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." + ), f"Parameter `{name}` expects {num_total_ports} ports, got {len(ports)}." + for port in ports: + assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." console_logger.debug(f"post init {name}: {ports}") return ports diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 8da77548951..3b4a497b91d 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -84,6 +84,7 @@ if args.tool_parser_plugin: ToolParserManager.import_tool_parser(args.tool_parser_plugin) llm_engine = None +engine_args = None MAX_CONCURRENT_CONNECTIONS = (args.max_concurrency + args.workers - 1) // args.workers connection_semaphore = StatefulSemaphore(MAX_CONCURRENT_CONNECTIONS) @@ -108,7 +109,7 @@ def load_engine(): """ load engine """ - global llm_engine + global engine_args, llm_engine if llm_engine is not None: return llm_engine @@ -127,7 +128,7 @@ def load_data_service(): """ load data service """ - global llm_engine + global engine_args, llm_engine if llm_engine is not None: return llm_engine api_server_logger.info(f"FastDeploy LLM API server starting... 
{os.getpid()}, port: {args.port}") @@ -147,6 +148,7 @@ async def lifespan(app: FastAPI): """ async context manager for FastAPI lifespan """ + global engine_args import logging uvicorn_access = logging.getLogger("uvicorn.access") @@ -173,7 +175,6 @@ async def lifespan(app: FastAPI): verification = False model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)] - engine_args = EngineArgs.from_cli_args(args) fd_config = engine_args.create_engine_config(port_availability_check=False) engine_client = EngineClient( pid=pid, From d164731681405bb0bd65889d1a364185d52fe0ab Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Fri, 12 Dec 2025 17:56:54 +0800 Subject: [PATCH 15/17] [fix] fix test_run_batch --- fastdeploy/engine/args_utils.py | 32 +++++++++++++++------ fastdeploy/entrypoints/openai/api_server.py | 8 +++--- fastdeploy/entrypoints/openai/run_batch.py | 6 ++-- tests/engine/test_common_engine.py | 2 +- 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 1426a8c27ac..b7d95d2c944 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -499,6 +499,11 @@ class EngineArgs: Flag to rollout routing replay(r3) """ + skip_port_check: bool = False + """ + Whether to skip port availability check. Default is False (not skip). + """ + def __post_init__(self): """ Post-initialization processing to set default tokenizer if not provided. @@ -549,13 +554,22 @@ def post_init_ports(name: str, ports: list, num_total_ports: int): num_cur_dp_ports //= self.data_parallel_size if ports is None: ports = find_free_ports(num_ports=num_cur_dp_ports) - console_logger.info(f"Parameter `{name}` is not specified, found available ports for use: {ports}") + console_logger.info( + f"Parameter `{name}` is not specified, found available ports for possible use: {ports}" + ) else: - assert ( - len(ports) == num_total_ports - ), f"Parameter `{name}` expects {num_total_ports} ports, got {len(ports)}." - for port in ports: - assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." + num_input_ports = len(ports) + if num_input_ports != num_total_ports: + ports = find_free_ports(num_ports=num_cur_dp_ports) + console_logger.warn( + f"Parameter `{name}` expects {num_total_ports} ports, but got {num_input_ports}. Ignore them and assign new ones: {ports}" + ) + else: + console_logger.info(f"Using `{name}`: {ports}") + + if not self.skip_port_check: + for port in ports: + assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." console_logger.debug(f"post init {name}: {ports}") return ports @@ -1195,7 +1209,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: return parser @classmethod - def from_cli_args(cls, args: FlexibleArgumentParser) -> "EngineArgs": + def from_cli_args(cls, args: FlexibleArgumentParser, skip_port_check=False) -> "EngineArgs": """ Create an instance of EngineArgs from command line arguments. 
""" @@ -1203,7 +1217,7 @@ def from_cli_args(cls, args: FlexibleArgumentParser) -> "EngineArgs": for field in dataclass_fields(cls): if hasattr(args, field.name): args_dict[field.name] = getattr(args, field.name) - return cls(**args_dict) + return cls(**args_dict, skip_port_check=skip_port_check) def create_speculative_config(self) -> SpeculativeConfig: """ """ @@ -1282,7 +1296,7 @@ def create_routing_repaly_config(self) -> RoutingReplayConfig: routing_replay_args[k] = v return RoutingReplayConfig(routing_replay_args) - def create_engine_config(self, port_availability_check=True) -> FDConfig: + def create_engine_config(self) -> FDConfig: """ Create and return a Config object based on the current settings. """ diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 3b4a497b91d..93fd30932cc 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -84,7 +84,6 @@ if args.tool_parser_plugin: ToolParserManager.import_tool_parser(args.tool_parser_plugin) llm_engine = None -engine_args = None MAX_CONCURRENT_CONNECTIONS = (args.max_concurrency + args.workers - 1) // args.workers connection_semaphore = StatefulSemaphore(MAX_CONCURRENT_CONNECTIONS) @@ -109,7 +108,7 @@ def load_engine(): """ load engine """ - global engine_args, llm_engine + global llm_engine if llm_engine is not None: return llm_engine @@ -128,7 +127,7 @@ def load_data_service(): """ load data service """ - global engine_args, llm_engine + global llm_engine if llm_engine is not None: return llm_engine api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}") @@ -175,7 +174,8 @@ async def lifespan(app: FastAPI): verification = False model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)] - fd_config = engine_args.create_engine_config(port_availability_check=False) + engine_args = EngineArgs.from_cli_args(args, skip_port_check=True) + fd_config = engine_args.create_engine_config() engine_client = EngineClient( pid=pid, port=int(os.environ.get("INFERENCE_MSG_QUEUE_ID", "0")), diff --git a/fastdeploy/entrypoints/openai/run_batch.py b/fastdeploy/entrypoints/openai/run_batch.py index 267dac586d3..67766ff67e8 100644 --- a/fastdeploy/entrypoints/openai/run_batch.py +++ b/fastdeploy/entrypoints/openai/run_batch.py @@ -351,8 +351,8 @@ def create_model_paths(args: Namespace) -> List[ModelPath]: async def initialize_engine_client(args: Namespace, pid: int) -> EngineClient: """Initialize and configure the engine client.""" - engine_args = EngineArgs.from_cli_args(args) - fd_config = engine_args.create_engine_config(port_availability_check=False) + engine_args = EngineArgs.from_cli_args(args, skip_port_check=True) + fd_config = engine_args.create_engine_config() engine_client = EngineClient( pid=pid, port=int(args.engine_worker_queue_port[args.local_data_parallel_id]), @@ -485,7 +485,7 @@ async def main(args: argparse.Namespace): try: if args.workers is None: args.workers = max(min(int(args.max_num_seqs // 32), 8), 1) - + console_logger.info(f"Workers: {args.workers}") args.model = retrive_model_from_server(args.model, args.revision) if args.tool_parser_plugin: diff --git a/tests/engine/test_common_engine.py b/tests/engine/test_common_engine.py index 673632ee0d0..82cf15eb2f3 100644 --- a/tests/engine/test_common_engine.py +++ b/tests/engine/test_common_engine.py @@ -230,7 +230,7 @@ def _make_cfg(self, **kwargs): # Always enable chunked prefill in tests to avoid 
another strict check args.enable_chunked_prefill = True - return args.create_engine_config(port_availability_check=False) + return args.create_engine_config() def _stub_processor(self): class _Tok: From 1fb86eb4e59d546720ffad2596c68bfe27623997 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Fri, 12 Dec 2025 18:18:34 +0800 Subject: [PATCH 16/17] [fix] fix test_metrics --- tests/ci_use/metrics/test_metrics.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/ci_use/metrics/test_metrics.py b/tests/ci_use/metrics/test_metrics.py index 11f4e31e344..21a78606be2 100644 --- a/tests/ci_use/metrics/test_metrics.py +++ b/tests/ci_use/metrics/test_metrics.py @@ -14,6 +14,7 @@ import asyncio import os +import re import shutil import signal import subprocess @@ -47,7 +48,7 @@ def setup_and_run_server(): - Tears down server after all tests finish """ print("Pre-test port cleanup...") - FD_CONTROLLER_PORT = int(os.getenv("FD_CONTROLLER_PORT", 8333)) + FD_CONTROLLER_PORT = int(os.getenv("FD_CONTROLLER_PORT", 8633)) clean_ports([FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT, FD_CONTROLLER_PORT]) env = os.environ.copy() @@ -174,10 +175,9 @@ def parse_prometheus_to_dict(metrics_text: str): value = float(line.split("}")[1].strip()) # 解析 labels - labels = {} - for kv in labels_str.split(","): - k, v = kv.split("=") - labels[k] = v.strip('"') + # 用正则取出所有 key 和 value(去掉外层引号) + pairs = re.findall(r'(\w+)="([^"]*)"', labels_str) + labels = {k: v for k, v in pairs} # 存储 if metric_name not in result: @@ -214,7 +214,7 @@ def test_metrics_with_clear_and_reset(): """ Test the metrics monitoring endpoint. """ - FD_CONTROLLER_PORT = int(os.getenv("FD_CONTROLLER_PORT", 8333)) + FD_CONTROLLER_PORT = int(os.getenv("FD_CONTROLLER_PORT", 8633)) metrics_url = f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" async_concurrency(n=10) From 7d4ff93aa5f5af274b1ebd67dc9c6c451d595fb7 Mon Sep 17 00:00:00 2001 From: liyonghua0910 Date: Fri, 12 Dec 2025 19:43:22 +0800 Subject: [PATCH 17/17] [fix] fix splitwise connector init --- fastdeploy/splitwise/splitwise_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index 36879e02b81..870a1a9d973 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -55,7 +55,7 @@ def __init__(self, cfg, worker_queue, resource_manager): self.current_request_ids = dict() self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1" - if self.cfg.cache_config.pd_comm_port is not None: + if self.cfg.scheduler_config.splitwise_role != "mixed": self.zmq_ctx = zmq.Context() self.push_sockets: Dict[str, zmq.Socket] = {} self.pull_socket = None