
Commit 6a12481

Fixing DCO issue and format checker issue
Co-authored-by: KuntaiDu <kuntai@uchicago.edu>
Co-authored-by: YaoJiayi <1200040070@link.cuhk.edu.cn>
Signed-off-by: ApostaC <yihua98@uchicago.edu>
1 parent 1cab43c commit 6a12481

19 files changed, +844 -43 lines changed
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

# Read prompts written by the prefill instance from output.txt
prompts = []
try:
    with open("output.txt") as f:
        for line in f:
            prompts.append(line.strip())
    print(f"Loaded {len(prompts)} prompts from output.txt")
except FileNotFoundError:
    print("Error: output.txt file not found")
    exit(-1)

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

llm = LLM(
    model="meta-llama/llama-3.1-8b-instruct",
    enforce_eager=True,
    gpu_memory_utilization=0.8,
    kv_transfer_config=KVTransferConfig.from_cli(
        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
    ))  # , max_model_len=2048, max_num_batched_tokens=2048

# 2nd generation (decode instance): reuses the KV cache saved by the prefill run
outputs = llm.generate(prompts, sampling_params)

new_prompts = []
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    new_prompts.append(prompt + generated_text)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

context = "Hi " * 1000
context2 = "Hey " * 500
prompts = [
    context + "Hello, my name is",
    context + "The capital of France is",
    context2 + "Your name is",
    context2 + "The capital of China is",
]

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

llm = LLM(model="meta-llama/llama-3.1-8b-instruct",
          enforce_eager=True,
          gpu_memory_utilization=0.8,
          kv_transfer_config=KVTransferConfig.from_cli(
              '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
              '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}')
          )  # , max_model_len=2048, max_num_batched_tokens=2048

# 1ST generation (prefill instance): max_tokens=1, so only the prefill runs
outputs = llm.generate(
    prompts,
    sampling_params,
)

new_prompts = []
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    new_prompts.append(prompt + generated_text)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

# Write new_prompts to output.txt for the decode instance to consume
with open("output.txt", "w") as f:
    for prompt in new_prompts:
        f.write(prompt + "\n")
print(f"Saved {len(new_prompts)} prompts to output.txt")
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
find /tmp -iname "*attn.pt" 2>/dev/null | cut -d'/' -f1,2,3 | uniq | xargs rm -r

VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 prefill_example.py
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=1 python3 decode_example.py
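The same sequence can also be driven from a single Python launcher instead of the shell script above. A minimal sketch, assuming prefill_example.py and decode_example.py sit in the current directory; the subprocess wrapper is illustrative and not part of this commit:

import os
import subprocess

# Match the environment used by the shell script: keep the v1 engine in a
# single process and pin both runs to the same GPU.
env = dict(os.environ,
           VLLM_ENABLE_V1_MULTIPROCESSING="0",
           CUDA_VISIBLE_DEVICES="1")

# Prefill first (fills local_storage and writes output.txt), then decode
# (reads output.txt and reuses the stored KV cache).
for script in ("prefill_example.py", "decode_example.py"):
    subprocess.run(["python3", script], env=env, check=True)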

requirements/test.txt

Lines changed: 19 additions & 3 deletions
@@ -23,6 +23,10 @@ anyio==4.6.2.post1
     # via httpx
 argcomplete==3.5.1
     # via datamodel-code-generator
+async-timeout==5.0.1
+    # via
+    #   aiohttp
+    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -117,6 +121,10 @@ encodec==0.1.1
     # via vocos
 evaluate==0.4.3
     # via lm-eval
+exceptiongroup==1.2.2
+    # via
+    #   anyio
+    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -556,9 +564,7 @@ sentence-transformers==3.2.1
 sentencepiece==0.2.0
     # via mistral-common
 setuptools==75.8.0
-    # via
-    #   pytablewriter
-    #   torch
+    # via pytablewriter
 shellingham==1.5.4
     # via typer
 six==1.16.0
@@ -605,6 +611,12 @@ timm==1.0.11
     # via -r requirements/test.in
 tokenizers==0.21.0
     # via transformers
+toml==0.10.2
+    # via datamodel-code-generator
+tomli==2.2.1
+    # via
+    #   black
+    #   pytest
 torch==2.6.0
     # via
     #   -r requirements/test.in
@@ -670,12 +682,16 @@ typer==0.15.2
     # via fastsafetensors
 typing-extensions==4.12.2
     # via
+    #   anyio
+    #   black
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   rich
     #   torch
     #   typer
 tzdata==2024.2

vllm/attention/layer.py

Lines changed: 28 additions & 3 deletions
@@ -10,6 +10,7 @@
 from vllm.attention import AttentionType
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
 from vllm.config import CacheConfig, get_current_vllm_config
+from vllm.distributed import get_kv_transfer_group
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
@@ -179,6 +180,7 @@ def forward(
         context using
         `vllm.forward_context.get_forward_context().attn_metadata`.
         """
+        get_kv_transfer_group().wait_for_layer_load(self.layer_name)
         if self.calculate_kv_scales:
             attn_metadata = get_forward_context().attn_metadata
             if attn_metadata.enable_kv_scales_calculation:
@@ -214,20 +216,26 @@ def forward(
                                   self_kv_cache,
                                   attn_metadata,
                                   output=output)
+                save_kv_layer_to_connector(self.layer_name, self.kv_cache)
             else:
                 torch.ops.vllm.unified_attention_with_output(
                     query, key, value, output, self.layer_name)
+                save_kv_layer_to_connector(self.layer_name, self.kv_cache)
             return output.view(-1, hidden_size)
         else:
             if self.use_direct_call:
                 forward_context = get_forward_context()
                 attn_metadata = forward_context.attn_metadata
                 self_kv_cache = self.kv_cache[forward_context.virtual_engine]
-                return self.impl.forward(self, query, key, value,
-                                         self_kv_cache, attn_metadata)
+                output = self.impl.forward(self, query, key, value,
+                                           self_kv_cache, attn_metadata)
+                save_kv_layer_to_connector(self.layer_name, self.kv_cache)
+                return output
             else:
-                return torch.ops.vllm.unified_attention(
+                output = torch.ops.vllm.unified_attention(
                     query, key, value, self.layer_name)
+                save_kv_layer_to_connector(self.layer_name, self.kv_cache)
+                return output
 
     def calc_kv_scales(self, query, key, value):
         self._q_scale.copy_(torch.abs(query).max() / self.q_range)
@@ -329,6 +337,23 @@ def forward(
         return out.reshape(bsz, q_len, -1)
 
 
+def save_kv_layer_to_connector(
+    layer_name: str,
+    kv_cache: List[torch.Tensor],
+):
+    forward_context: ForwardContext = get_forward_context()
+    attn_metadata = forward_context.attn_metadata
+    if attn_metadata is None:
+        return
+
+    connector = get_kv_transfer_group()
+    if connector is None:
+        return
+
+    kv_cache_layer = kv_cache[forward_context.virtual_engine]
+    connector.save_kv_layer(layer_name, kv_cache_layer, attn_metadata)
+
+
 def unified_attention(
     query: torch.Tensor,
     key: torch.Tensor,
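Taken together, the layer.py changes give every attention layer the same bracketing pattern: block on the connector before attention, run attention, then hand the layer's KV cache back to the connector. A condensed sketch of that per-layer flow; the function and the attn_kernel callable are illustrative stand-ins, and only wait_for_layer_load and save_kv_layer come from the diff above:

from typing import Any, Callable

import torch


def attention_forward_with_kv_connector(
        layer_name: str,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: Any,
        connector: Any,
        attn_kernel: Callable[..., torch.Tensor],
) -> torch.Tensor:
    # 1. Block until any externally stored KV for this layer has been loaded
    #    into kv_cache (mirrors get_kv_transfer_group().wait_for_layer_load).
    connector.wait_for_layer_load(layer_name)
    # 2. Run the real attention computation, which also writes this step's new
    #    key/value entries into kv_cache.
    output = attn_kernel(query, key, value, kv_cache, attn_metadata)
    # 3. Hand the updated layer cache to the connector so it can be persisted
    #    layer by layer (mirrors save_kv_layer_to_connector).
    connector.save_kv_layer(layer_name, kv_cache, attn_metadata)
    return output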

vllm/distributed/kv_transfer/kv_connector/factory.py

Lines changed: 45 additions & 8 deletions
@@ -1,16 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import importlib
-from typing import TYPE_CHECKING, Callable, Dict, Type
+from typing import TYPE_CHECKING, Callable, Dict, Optional, Type, Union
+
+import vllm.envs as envs
+# NOTE(Kuntai): We prefer not to directly the classes with "_V1" suffix.
+# This makes it easier for us to deprecate code in v0 (which will happen soon).
+# yapf: disable
+from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
+                                                          KVConnectorRole)
+# yapf: enable
+from vllm.logger import init_logger
 
 from .base import KVConnectorBase
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
 
+logger = init_logger(__name__)
+
 
 class KVConnectorFactory:
-    _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {}
+    _registry: Dict[str, Callable[[], Type[Union[KVConnectorBase,
+                                                 KVConnectorBase_V1]]]] = {}
 
     @classmethod
     def register_connector(cls, name: str, module_path: str,
@@ -19,21 +31,41 @@ def register_connector(cls, name: str, module_path: str,
         if name in cls._registry:
             raise ValueError(f"Connector '{name}' is already registered.")
 
-        def loader() -> Type[KVConnectorBase]:
+        def loader() -> Type[Union[KVConnectorBase, KVConnectorBase_V1]]:
             module = importlib.import_module(module_path)
             return getattr(module, class_name)
 
         cls._registry[name] = loader
 
     @classmethod
-    def create_connector(cls, rank: int, local_rank: int,
-                         config: "VllmConfig") -> KVConnectorBase:
+    def create_connector(
+        cls, rank: Optional[int], local_rank: Optional[int],
+        config: "VllmConfig", role: KVConnectorRole
+    ) -> Union[KVConnectorBase, KVConnectorBase_V1]:
         connector_name = config.kv_transfer_config.kv_connector
         if connector_name not in cls._registry:
             raise ValueError(f"Unsupported connector type: {connector_name}")
 
-        connector_cls = cls._registry[connector_name]()
-        return connector_cls(rank, local_rank, config)
+        if envs.VLLM_USE_V1:
+            # NOTE(Kuntai): v1 connector is explicitly separated into two roles.
+            # Scheduler connector:
+            # - Co-locate with scheduler process
+            # - Should only be used inside the Scheduler class
+            # Worker connector:
+            # - Co-locate with worker process
+            # - Should only be used inside the forward context & attention layer
+            # We build these two connectors separately to enforce strict
+            # separation
+            connector_cls_v1 = cls._registry[connector_name]()
+            assert issubclass(connector_cls_v1, KVConnectorBase_V1)
+            logger.info("Creating v1 connector with name: %s", connector_name)
+            return connector_cls_v1(rank, local_rank, config, role)
+        else:
+            assert rank is not None
+            assert local_rank is not None
+            connector_cls = cls._registry[connector_name]()
+            assert issubclass(connector_cls, KVConnectorBase)
+            return connector_cls(rank, local_rank, config)
 
 
 # Register various connectors here.
@@ -57,4 +89,9 @@ def create_connector(cls, rank: int, local_rank: int,
 KVConnectorFactory.register_connector(
     "MooncakeStoreConnector",
     "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector",
-    "MooncakeStoreConnector")
+    "MooncakeStoreConnector")
+
+KVConnectorFactory.register_connector(
+    "SharedStorageConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector",
+    "SharedStorageConnector")
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# SPDX-License-Identifier: Apache-2.0
# yapf: disable
from vllm.distributed.kv_transfer.kv_connector.v1.base import (
    KVConnectorBase_V1, KVConnectorRole)

# yapf: enable

__all__ = [
    "KVConnectorRole",
    "KVConnectorBase_V1",
]
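The new v1 package only exports the role enum and the abstract base class. Below is a skeleton of what a subclass might look like, limited to the two worker-side hooks exercised by the attention-layer changes in this commit; the constructor arguments mirror the factory call connector_cls_v1(rank, local_rank, config, role), the base-class constructor signature is assumed, and any further abstract methods required by KVConnectorBase_V1 are not shown here and would also need to be implemented:

import torch

from vllm.distributed.kv_transfer.kv_connector.v1 import (KVConnectorBase_V1,
                                                           KVConnectorRole)


class NoOpConnector(KVConnectorBase_V1):
    """Hypothetical do-nothing connector used only to illustrate the hooks."""

    def __init__(self, rank, local_rank, config, role: KVConnectorRole):
        # Mirrors connector_cls_v1(rank, local_rank, config, role) from the
        # factory; the base-class constructor signature is an assumption.
        super().__init__(rank, local_rank, config, role)

    def wait_for_layer_load(self, layer_name: str) -> None:
        # Called at the top of each attention layer's forward pass; a real
        # connector would block here until that layer's KV is in place.
        pass

    def save_kv_layer(self, layer_name: str, kv_cache_layer: torch.Tensor,
                      attn_metadata) -> None:
        # Called after each attention layer; a real connector would copy
        # kv_cache_layer out to shared storage keyed by layer_name.
        pass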
