From 9027b5752215235bcce35f039da5f87f27d8e905 Mon Sep 17 00:00:00 2001
From: Michael Choi
Date: Fri, 19 Sep 2025 22:12:52 +0000
Subject: [PATCH 1/5] updates to support vllm 0.10.2

---
 .../inference/vllm/vllm_batch.py | 22 +++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/model-engine/model_engine_server/inference/vllm/vllm_batch.py b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
index b2d9ebbd..3289f313 100644
--- a/model-engine/model_engine_server/inference/vllm/vllm_batch.py
+++ b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
@@ -53,7 +53,7 @@
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest, ErrorResponse
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
-from vllm.entrypoints.openai.serving_engine import BaseModelPath
+from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels
 from vllm.utils import merge_async_iterators
 
 CONFIG_FILE = os.getenv("CONFIG_FILE")
@@ -202,7 +202,7 @@ def determine_max_concurrent_requests(
     # anecdotally, we're seeing the engine able to handle around 7req/s (for outlines), so set to 30 * 7 ~= 200
     if any(
         request.to_sampling_params(
-            default_max_tokens=1, logits_processor_pattern=None
+            max_tokens=1, logits_processor_pattern=None, default_sampling_params={}
         ).guided_decoding
         for request in requests
     ):
@@ -294,7 +294,6 @@ async def init_engine(
             os.environ.get("NUM_INSTANCES", 1)
         ),  # TODO maybe do something other than TP=8, PP=number of nodes
         seed=request.model_cfg.seed or 0,
-        disable_log_requests=True,
         gpu_memory_utilization=request.max_gpu_memory_utilization or 0.9,
     )
     default_engine_args_dict.update(engine_args_dict)
@@ -304,15 +303,22 @@ async def init_engine(
     engine_client = AsyncLLMEngine.from_engine_args(engine_args)
     model_config = await engine_client.get_model_config()
     resolved_chat_template = load_chat_template(parsed_configs.chat_template)
+    parsed_configs
+
     base_model_paths = [BaseModelPath(name=served_model_name, model_path=model_id)]
 
+    openai_serving_models = OpenAIServingModels(
+        engine_client=engine_client,
+        model_config=model_config,
+        base_model_paths=base_model_paths,
+    )
+    await openai_serving_models.init_static_loras()
+
     openai_serving_chat = OpenAIServingChat(
         engine_client,
         model_config,
-        base_model_paths,
+        openai_serving_models,
         response_role=request.model_cfg.response_role or "assistant",
-        lora_modules=None,
-        prompt_adapters=None,
         request_logger=None,
         chat_template=resolved_chat_template,
         chat_template_content_format=None,
@@ -321,9 +327,7 @@
     openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
-        base_model_paths,
-        lora_modules=None,
-        prompt_adapters=None,
+        openai_serving_models,
         request_logger=None,
     )
 

From ded3e94971465187d2a5ffcc46d91c6f837f72d9 Mon Sep 17 00:00:00 2001
From: Michael Choi
Date: Fri, 19 Sep 2025 22:16:20 +0000
Subject: [PATCH 2/5] bump to 0.10.2

---
 model-engine/model_engine_server/inference/vllm/Dockerfile.vllm | 2 +-
 .../inference/vllm/build_and_upload_image.sh                    | 2 +-
 .../model_engine_server/inference/vllm/requirements-dev.txt     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm b/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
index c4162cf6..98cf4935 100644
--- a/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
+++ b/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
@@ -1,5 +1,5 @@
 # syntax=docker/dockerfile:1
-ARG VLLM_VERSION=0.10.1.1
+ARG VLLM_VERSION=0.10.2
 ARG VLLM_BASE_REPO=vllm/vllm-openai
 ARG VLLM_BASE_IMAGE=${VLLM_BASE_REPO}:v${VLLM_VERSION}
 FROM ${VLLM_BASE_IMAGE} AS base
diff --git a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh
index 866faabe..0026e74a 100755
--- a/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh
+++ b/model-engine/model_engine_server/inference/vllm/build_and_upload_image.sh
@@ -29,7 +29,7 @@ fi
 ACCOUNT=$1
 IMAGE_TAG=$2
 BUILD_TARGET=$3
-VLLM_VERSION=${VLLM_VERSION:-"0.10.1.1"}
+VLLM_VERSION=${VLLM_VERSION:-"0.10.2"}
 VLLM_BASE_REPO=${VLLM_BASE_REPO:-"vllm/vllm-openai"}
 
 # if build target = vllm use vllm otherwise use vllm_batch
diff --git a/model-engine/model_engine_server/inference/vllm/requirements-dev.txt b/model-engine/model_engine_server/inference/vllm/requirements-dev.txt
index 5b1df691..3fa740b6 100644
--- a/model-engine/model_engine_server/inference/vllm/requirements-dev.txt
+++ b/model-engine/model_engine_server/inference/vllm/requirements-dev.txt
@@ -1 +1 @@
-vllm==0.10.1.1
+vllm==0.10.2

From 81e402f33775b61e7491ed718455779c6563e830 Mon Sep 17 00:00:00 2001
From: Michael Choi
Date: Fri, 19 Sep 2025 22:20:15 +0000
Subject: [PATCH 3/5] cleanup

---
 model-engine/model_engine_server/inference/vllm/vllm_batch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/model-engine/model_engine_server/inference/vllm/vllm_batch.py b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
index 3289f313..d147e791 100644
--- a/model-engine/model_engine_server/inference/vllm/vllm_batch.py
+++ b/model-engine/model_engine_server/inference/vllm/vllm_batch.py
@@ -303,7 +303,6 @@ async def init_engine(
     engine_client = AsyncLLMEngine.from_engine_args(engine_args)
     model_config = await engine_client.get_model_config()
     resolved_chat_template = load_chat_template(parsed_configs.chat_template)
-    parsed_configs
 
     base_model_paths = [BaseModelPath(name=served_model_name, model_path=model_id)]
 

From 62302fee606e574540dc64ae3e5440998098929a Mon Sep 17 00:00:00 2001
From: Michael Choi
Date: Wed, 24 Sep 2025 17:52:50 +0000
Subject: [PATCH 4/5] test reqs

---
 .../inference/vllm/requirements-batch.txt | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/model-engine/model_engine_server/inference/vllm/requirements-batch.txt b/model-engine/model_engine_server/inference/vllm/requirements-batch.txt
index d339f593..467d875f 100644
--- a/model-engine/model_engine_server/inference/vllm/requirements-batch.txt
+++ b/model-engine/model_engine_server/inference/vllm/requirements-batch.txt
@@ -1,8 +1,11 @@
 pydantic>=2.8
 boto3==1.34.15
 smart-open==6.4.0
-ddtrace==2.11.0
-datadog==0.49.1
+# ddtrace==2.11.0
+ddtrace==2.21.11
+# datadog==0.49.1
+wrapt>=1.15,<2
+datadog==0.52.1
 dataclasses-json~=0.6.7
 sse-starlette==2.1.3
 ray[client]==2.37.0
\ No newline at end of file

From 0c824eef59e9af963363cfbaa0a256615a201123 Mon Sep 17 00:00:00 2001
From: Michael Choi
Date: Wed, 24 Sep 2025 17:53:54 +0000
Subject: [PATCH 5/5] updates

---
 .../inference/vllm/Dockerfile.vllm          |  2 +-
 .../inference/vllm/init_ray_batch_inf_v2.py | 11 +++++++++++
 .../inference/vllm/requirements-batch.txt   |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm b/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
index 98cf4935..87bfb531 100644
--- a/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
+++ b/model-engine/model_engine_server/inference/vllm/Dockerfile.vllm
@@ -5,7 +5,7 @@ ARG VLLM_BASE_IMAGE=${VLLM_BASE_REPO}:v${VLLM_VERSION}
 FROM ${VLLM_BASE_IMAGE} AS base
 
 RUN apt-get update \
-    && apt-get install -y wget gdb psmisc dumb-init \
+    && apt-get install -y wget gdb psmisc dumb-init iproute2 netcat \
     && apt-get autoremove -y \
     && rm -rf /var/lib/apt/lists/* \
     apt-get clean
diff --git a/model-engine/model_engine_server/inference/vllm/init_ray_batch_inf_v2.py b/model-engine/model_engine_server/inference/vllm/init_ray_batch_inf_v2.py
index 473df928..22a51d6a 100644
--- a/model-engine/model_engine_server/inference/vllm/init_ray_batch_inf_v2.py
+++ b/model-engine/model_engine_server/inference/vllm/init_ray_batch_inf_v2.py
@@ -229,7 +229,18 @@ def main(mode: str):
 
 
 if __name__ == "__main__":
+    import os
+
     parser = argparse.ArgumentParser()
     parser.add_argument("--mode", choices=["wait_for_head_node_to_exit"], required=True)
+    # export environment variable to disable ray logging
+    os.environ["NCCL_DEBUG"] = "INFO"
+    os.environ["NCCL_DEBUG_SUBSYS"] = "INIT,NET"
+    # os.environ["FI_PROVIDER"] = "efa"  # you're requesting EFA devices
+    # os.environ["AWS_OFI_NCCL"] = "1"
+    os.environ["NCCL_IB_DISABLE"] = "0"
+    # os.environ["NCCL_SOCKET_IFNAME"] = "eth0,eth1"  # include the real NICs (EFA is commonly on eth1)
+    os.environ["NCCL_CROSS_NIC"] = "1"  # allow cross-NIC if ranks land on different NICs
+    os.environ["NCCL_NET_GDR_LEVEL"] = "0"
     args = parser.parse_args()
     main(args.mode)
diff --git a/model-engine/model_engine_server/inference/vllm/requirements-batch.txt b/model-engine/model_engine_server/inference/vllm/requirements-batch.txt
index 467d875f..d5ef1bdf 100644
--- a/model-engine/model_engine_server/inference/vllm/requirements-batch.txt
+++ b/model-engine/model_engine_server/inference/vllm/requirements-batch.txt
@@ -8,4 +8,4 @@ wrapt>=1.15,<2
 datadog==0.52.1
 dataclasses-json~=0.6.7
 sse-starlette==2.1.3
-ray[client]==2.37.0
\ No newline at end of file
+ray[client]==2.48.0
\ No newline at end of file