Commit dcd4721

Merge branch 'vllm-project:main' into main
2 parents: 790ad45 + eb6d3c2

Note: large commits have some content hidden by default, so only a subset of the changed files is shown below.

54 files changed: +666 −372 lines

benchmarks/benchmark_latency.py

Lines changed: 6 additions & 8 deletions
@@ -153,15 +153,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
         action='store_true',
         help='enforce eager mode and disable CUDA graph')
     parser.add_argument(
-        "--kv-cache-dtype",
+        '--kv-cache-dtype',
         type=str,
-        choices=['auto', 'fp8'],
-        default='auto',
-        help=
-        'Data type for kv cache storage. If "auto", will use model data type. '
-        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
-        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
-        'instead supported for common inference criteria.')
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
     parser.add_argument(
         '--quantization-param-path',
         type=str,
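
Note (not part of the diff): the same expanded set of choices is plumbed through the Python API as well — the updated tests/models/test_fp8.py below passes kv_cache_dtype straight to LLM(). A minimal, hedged usage sketch (the model name is illustrative):

    from vllm import LLM, SamplingParams

    # kv_cache_dtype accepts "auto", "fp8", "fp8_e5m2", or "fp8_e4m3"
    llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct",
              kv_cache_dtype="fp8",
              enforce_eager=True)
    out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
    print(out[0].outputs[0].text)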

benchmarks/benchmark_throughput.py

Lines changed: 5 additions & 7 deletions
@@ -323,15 +323,13 @@ def main(args: argparse.Namespace):
         action="store_true",
         help="enforce eager execution")
     parser.add_argument(
-        "--kv-cache-dtype",
+        '--kv-cache-dtype',
         type=str,
-        choices=["auto", "fp8"],
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
         default="auto",
-        help=
-        'Data type for kv cache storage. If "auto", will use model data type. '
-        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
-        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
-        'common inference criteria.')
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
     parser.add_argument(
         '--quantization-param-path',
         type=str,

benchmarks/kernels/benchmark_paged_attention.py

Lines changed: 4 additions & 6 deletions
@@ -183,13 +183,11 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
     parser.add_argument(
         "--kv-cache-dtype",
         type=str,
-        choices=["auto", "fp8"],
+        choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
         default="auto",
-        help=
-        'Data type for kv cache storage. If "auto", will use model data type. '
-        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
-        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
-        'common inference criteria.')
+        help="Data type for kv cache storage. If 'auto', will use model "
+        "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+        "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
     args = parser.parse_args()
     print(args)

csrc/punica/bgmv/bgmv_config.h

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 2752) \
   f(in_T, out_T, W_T, narrow, 2816) \
   f(in_T, out_T, W_T, narrow, 3072) \
+  f(in_T, out_T, W_T, narrow, 3328) \
   f(in_T, out_T, W_T, narrow, 3456) \
   f(in_T, out_T, W_T, narrow, 3584) \
   f(in_T, out_T, W_T, narrow, 4096) \
@@ -36,6 +37,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, narrow, 5504) \
   f(in_T, out_T, W_T, narrow, 5632) \
   f(in_T, out_T, W_T, narrow, 6144) \
+  f(in_T, out_T, W_T, narrow, 6400) \
   f(in_T, out_T, W_T, narrow, 6848) \
   f(in_T, out_T, W_T, narrow, 6912) \
   f(in_T, out_T, W_T, narrow, 7168) \
@@ -97,6 +99,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, 2752, narrow) \
   f(in_T, out_T, W_T, 2816, narrow) \
   f(in_T, out_T, W_T, 3072, narrow) \
+  f(in_T, out_T, W_T, 3328, narrow) \
   f(in_T, out_T, W_T, 3456, narrow) \
   f(in_T, out_T, W_T, 3584, narrow) \
   f(in_T, out_T, W_T, 4096, narrow) \
@@ -105,6 +108,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
   f(in_T, out_T, W_T, 5504, narrow) \
   f(in_T, out_T, W_T, 5632, narrow) \
   f(in_T, out_T, W_T, 6144, narrow) \
+  f(in_T, out_T, W_T, 6400, narrow) \
   f(in_T, out_T, W_T, 6848, narrow) \
   f(in_T, out_T, W_T, 6912, narrow) \
   f(in_T, out_T, W_T, 7168, narrow) \

tests/lora/test_punica.py

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,7 @@ def _lora_ref_impl(
     2560,
     2752,
     3072,
+    3328,
     3456,
     3584,
     4096,
@@ -66,6 +67,7 @@ def _lora_ref_impl(
     5504,
     5632,
     6144,
+    6400,
     6848,
     6912,
     7168,

tests/models/test_fp8.py

Lines changed: 52 additions & 28 deletions
@@ -16,31 +16,55 @@
 MAX_MODEL_LEN = 1024

 MODELS = [
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV",
     "meta-llama/Meta-Llama-3-8B-Instruct",
 ]

 EXPECTED_STRS_MAP = {
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": [
-        'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-        'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-        'Zeta-5, a highly advanced robot designed for menial labor, whirred to a',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o',
-    ],
-    "meta-llama/Meta-Llama-3-8B-Instruct": [
-        'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-        'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-        'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-        'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-        'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
-        'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-        'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-        'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
-    ],
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": {
+        "auto": [
+            'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both',
+            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+            'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no'
+        ],
+        "fp8": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system made up of several basic components that work together to enable it to',
+            'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk'
+        ]
+    },
+    "meta-llama/Meta-Llama-3-8B-Instruct": {
+        "auto": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
+            'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu'
+        ],
+        "fp8": [
+            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
+            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
+            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
+            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
+            'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest',
+            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
+            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
+            'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu'
+        ]
+    },
 }

 capability = torch.cuda.get_device_capability()
@@ -52,14 +76,14 @@
 @pytest.mark.skipif(fp8_not_supported,
                     reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
-def test_models(
-    example_prompts,
-    model_name,
-) -> None:
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
+def test_models(example_prompts, model_name, kv_cache_dtype) -> None:
     model = LLM(model=model_name,
                 max_model_len=MAX_MODEL_LEN,
+                trust_remote_code=True,
                 enforce_eager=True,
-                quantization="fp8")
+                quantization="fp8",
+                kv_cache_dtype=kv_cache_dtype)

     tokenizer = AutoTokenizer.from_pretrained(model_name)
     formatted_prompts = [
@@ -81,8 +105,8 @@ def test_models(
         generations.append(outputs[0].outputs[0].text)
     del model

-    print(generations)
-    expected_strs = EXPECTED_STRS_MAP[model_name]
+    print(model_name, kv_cache_dtype, generations)
+    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
     for i in range(len(example_prompts)):
         generated_str = generations[i]
         expected_str = expected_strs[i]

vllm/attention/layer.py

Lines changed: 25 additions & 2 deletions
@@ -7,6 +7,8 @@
 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.selector import get_attn_backend
 from vllm.config import CacheConfig
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)


 class Attention(nn.Module):
@@ -30,6 +32,7 @@ def __init__(
         alibi_slopes: Optional[List[float]] = None,
         sliding_window: Optional[int] = None,
         cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
     ) -> None:
         super().__init__()
         if cache_config is not None:
@@ -40,6 +43,27 @@ def __init__(
             block_size = 16
         if num_kv_heads is None:
             num_kv_heads = num_heads
+
+        # The default kv_scale is set to 1.0. This is ignored
+        # when kv-cache is not fp8, and should be used with
+        # kv-cache in fp8_e5m2. For kv-cache in fp8_e4m3, we
+        # expect the pre-quantized kv_scale to be loaded along
+        # with the model weights.
+        self.kv_cache_dtype = kv_cache_dtype
+        self._kv_scale = 1.0
+        quant_method = quant_config.get_quant_method(
+            self) if quant_config else None
+        if quant_method is not None:
+            if self.kv_cache_dtype == "fp8_e5m2":
+                raise ValueError("fp8_e5m2 kv-cache is not supported with "
+                                 "fp8 checkpoints.")
+            # When FP8 quantization is enabled, we make a parameter
+            # "kv_scale" so that it can be loaded from FP8 checkpoint.
+            # The kv_scale will then be converted back
+            # to self._kv_scale in a native float32 value after weight loading.
+            self.quant_method = quant_method
+            self.quant_method.create_weights(self)
+
         # During model initialization, the default dtype is set as the model
         # weight and activation dtype.
         dtype = torch.get_default_dtype()
@@ -57,10 +81,9 @@ def forward(
         value: torch.Tensor,
         kv_cache: Optional[torch.Tensor],
         attn_metadata: AttentionMetadata,
-        kv_scale: float = 1.0,
     ) -> torch.Tensor:
         return self.impl.forward(query, key, value, kv_cache, attn_metadata,
-                                 kv_scale)
+                                 self._kv_scale)

     def extra_repr(self) -> str:
         s = f"head_size={self.impl.head_size}"  # type: ignore

vllm/config.py

Lines changed: 3 additions & 5 deletions
@@ -355,14 +355,12 @@ def _verify_args(self) -> None:
     def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
             pass
-        elif self.cache_dtype == "fp8":
+        elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"):
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "
-                "But it may cause slight accuracy drop without scaling "
-                "factors. FP8_E5M2 (without scaling) is only supported on "
-                "cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 "
-                "is instead supported for common inference criteria.")
+                "Meanwhile, it may cause accuracy drop without a proper "
+                "scaling factor")
         else:
             raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")

vllm/engine/arg_utils.py

Lines changed: 3 additions & 4 deletions
@@ -191,12 +191,11 @@ def add_cli_args(
         parser.add_argument(
             '--kv-cache-dtype',
             type=str,
-            choices=['auto', 'fp8'],
+            choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
            default=EngineArgs.kv_cache_dtype,
            help='Data type for kv cache storage. If "auto", will use model '
-            'data type. FP8_E5M2 (without scaling) is only supported on cuda '
-            'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
-            'supported for common inference criteria.')
+            'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+            'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
         parser.add_argument(
             '--quantization-param-path',
             type=nullable_str,
type=nullable_str,

vllm/engine/async_llm_engine.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,14 @@ async def step_async(
234234
# Log stats.
235235
self.do_log_stats(scheduler_outputs, output)
236236

237+
if not request_outputs:
238+
# Stop the execute model loop in parallel workers until there are
239+
# more requests to process. This avoids waiting indefinitely in
240+
# torch.distributed ops which may otherwise timeout, and unblocks
241+
# the RPC thread in the workers so that they can process any other
242+
# queued control plane messages, such as add/remove lora adapters.
243+
await self.model_executor.stop_remote_worker_execution_loop_async()
244+
237245
return request_outputs
238246

239247
async def encode_request_async(
@@ -687,7 +695,7 @@ async def encode(
687695
multi_modal_data: Multi modal data per request.
688696
689697
Yields:
690-
The output `EmbeddingRequestOutput` objects from the LLMEngine
698+
The output `EmbeddingRequestOutput` objects from the LLMEngine
691699
for the request.
692700
693701
Details:

vllm/engine/llm_engine.py

Lines changed: 8 additions & 0 deletions
@@ -692,6 +692,14 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
         # Log stats.
         self.do_log_stats(scheduler_outputs, output)

+        if not request_outputs:
+            # Stop the execute model loop in parallel workers until there are
+            # more requests to process. This avoids waiting indefinitely in
+            # torch.distributed ops which may otherwise timeout, and unblocks
+            # the RPC thread in the workers so that they can process any other
+            # queued control plane messages, such as add/remove lora adapters.
+            self.model_executor.stop_remote_worker_execution_loop()
+
         return request_outputs

     def do_log_stats(
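
For intuition, a hedged sketch of the worker-side loop that step() now stops when idle (names are illustrative, not vLLM's actual worker API): non-driver workers block on collective execute-model broadcasts from the driver until a stop signal arrives, after which their RPC threads are free to handle control-plane messages such as adding or removing LoRA adapters.

    def remote_worker_execution_loop(worker):
        while True:
            work = worker.receive_model_input()   # blocking collective with the driver
            if work is None:                      # driver signalled "stop the loop"
                return                            # worker goes idle, RPC thread unblocked
            worker.execute_model(work)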
