Commit 4af8f5d

Merge branch 'main_20240517' into main
2 parents: a1d0cd0 + c5711ef

84 files changed (+5814, −476 lines)


.buildkite/run-amd-test.sh (6 additions, 5 deletions)

@@ -1,4 +1,4 @@
-# This script build the ROCm docker image and runs test inside it.
+# This script runs test inside the corresponding ROCm docker container.
 set -ex
 
 # Print ROCm version
@@ -19,15 +19,16 @@ done
 
 echo "--- Building container"
 sha=$(git rev-parse --short HEAD)
-container_name=rocm_${sha}
+image_name=rocm_${sha}
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
 docker build \
-        -t ${container_name} \
+        -t ${image_name} \
         -f Dockerfile.rocm \
         --progress plain \
         .
 
 remove_docker_container() {
-  docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+  docker rm -f ${container_name} || docker image rm -f ${image_name} || true
 }
 trap remove_docker_container EXIT
 
@@ -39,6 +40,6 @@ docker run \
        --rm \
        -e HF_TOKEN \
        --name ${container_name} \
-       ${container_name} \
+       ${image_name} \
        /bin/bash -c "${@}"
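Note: the commit splits the previously shared name into image_name (stable per commit SHA, so the image tag is reused) and container_name (random per run, so concurrent CI jobs on one host cannot collide). A minimal Python sketch of the same naming scheme; the sha argument stands in for git rev-parse --short HEAD, and the function name is illustrative only:

import secrets
import string

def unique_container_name(sha: str, suffix_len: int = 10) -> str:
    # Same idea as the shell pipeline above:
    #   tr -dc A-Za-z0-9 < /dev/urandom | head -c 10
    alphabet = string.ascii_letters + string.digits
    suffix = "".join(secrets.choice(alphabet) for _ in range(suffix_len))
    return f"rocm_{sha}_{suffix}"

# e.g. unique_container_name("4af8f5d") -> "rocm_4af8f5d_kQ3xZ9bT1c" (suffix varies per call)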

.buildkite/run-benchmarks.sh (4 additions, 3 deletions)

@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
     exit $bench_serving_exit_code
 fi
 
-/workspace/buildkite-agent artifact upload openai-*.json
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+/workspace/buildkite-agent artifact upload "*.json"

.buildkite/test-pipeline.yaml (16 additions, 3 deletions)

@@ -5,13 +5,16 @@
 
 steps:
 - label: Regression Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
 
 - label: AsyncEngine Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s async_engine
 
 - label: Basic Correctness Test
+  mirror_hardwares: [amd]
   commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
@@ -24,14 +27,15 @@ steps:
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s distributed/test_comm_ops.py
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
 
 - label: Distributed Tests
+  mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  mirror_hardwares: [amd]
   commands:
   - pytest -v -s distributed/test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -42,19 +46,22 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py
 
 - label: Distributed Tests (Multiple Groups)
+  #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
   - pytest -v -s distributed/test_pynccl.py
 
 - label: Engine Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
   mirror_hardwares: [amd]
+
   commands:
   # these tests have to be separated, because each one will allocate all posible GPU memory
   - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
@@ -74,6 +81,7 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 
 - label: Kernels Test %N
+  #mirror_hardwares: [amd]
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
@@ -84,7 +92,7 @@ steps:
   - pytest -v -s models --ignore=models/test_llava.py
 
 - label: Llava Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   commands:
   - bash ../.buildkite/download-images.sh
   - pytest -v -s models/test_llava.py
@@ -95,6 +103,7 @@ steps:
   - pytest -v -s prefix_caching
 
 - label: Samplers Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s samplers
 
 - label: LogitsProcessor Test
@@ -110,16 +119,20 @@ steps:
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
+  #mirror_hardwares: [amd]
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Tensorizer Test
+  #mirror_hardwares: [amd]
   command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
 
 - label: Metrics Test
+  mirror_hardwares: [amd]
  command: pytest -v -s metrics
 
 - label: Quantization Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s quantization
 
 - label: Benchmarks
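Note: mirror_hardwares: [amd] marks a step that should also be scheduled on AMD hardware; the commented-out entries record steps not yet mirrored. A hedged sketch of listing the mirrored steps, assuming PyYAML is available and the script runs from the repository root:

import yaml  # assumed dependency: PyYAML

with open(".buildkite/test-pipeline.yaml") as f:
    pipeline = yaml.safe_load(f)

# Commented-out "#mirror_hardwares" lines are YAML comments, so they
# simply do not appear in the parsed steps.
amd_steps = [
    step["label"]
    for step in pipeline.get("steps", [])
    if "amd" in step.get("mirror_hardwares", [])
]
print("\n".join(amd_steps))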

.buildkite/test-template.j2 (1 addition, 2 deletions)

@@ -3,9 +3,8 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 
 steps:
-
   - label: ":docker: build image"
-  commands:
+    commands:
     - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
     - "docker push {{ docker_image }}"
     env:

CMakeLists.txt (28 additions, 2 deletions)

@@ -173,13 +173,38 @@ set(VLLM_EXT_SRC
   "csrc/pybind.cpp")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
+  include(FetchContent)
+  SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
+  FetchContent_Declare(
+        cutlass
+        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+        # CUTLASS 3.5.0
+        GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+  )
+  FetchContent_MakeAvailable(cutlass)
+
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-    "csrc/custom_all_reduce.cu")
+    "csrc/custom_all_reduce.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+
+  #
+  # The CUTLASS kernels for Hopper require sm90a to be enabled.
+  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+  set_source_files_properties(
+        "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+        PROPERTIES
+        COMPILE_FLAGS
+        "-gencode arch=compute_90a,code=sm_90a")
+
 endif()
 
 define_gpu_extension_target(
@@ -189,6 +214,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   WITH_SOABI)
 
 #

README.md (11 additions, 1 deletion)

@@ -14,8 +14,18 @@ Easy, fast, and cheap LLM serving for everyone
 
 </p>
 
+---
+
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
+---
+
 *Latest News* 🔥
-- [2024/05] We are hosting [the fourth vLLM meetup](https://lu.ma/event/manage/evt-A064fGpj52fviSn) with BentoML and Cloudflare on June 11! Please register [here](https://lu.ma/agivllm).
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
 - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
 - [2024/01] Added ROCm 6.0 support to vLLM.

benchmarks/benchmark_latency.py (24 additions, 2 deletions)

@@ -1,5 +1,6 @@
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
+import json
 import time
 from pathlib import Path
 from typing import Optional
@@ -18,6 +19,8 @@ def main(args: argparse.Namespace):
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(model=args.model,
+              speculative_model=args.speculative_model,
+              num_speculative_tokens=args.num_speculative_tokens,
               tokenizer=args.tokenizer,
               quantization=args.quantization,
               tensor_parallel_size=args.tensor_parallel_size,
@@ -28,6 +31,7 @@
               quantization_param_path=args.quantization_param_path,
               device=args.device,
               ray_workers_use_nsight=args.ray_workers_use_nsight,
+              use_v2_block_manager=args.use_v2_block_manager,
               enable_chunked_prefill=args.enable_chunked_prefill,
               download_dir=args.download_dir,
               block_size=args.block_size)
@@ -93,12 +97,24 @@ def run_to_completion(profile_dir: Optional[str] = None):
     for percentage, percentile in zip(percentages, percentiles):
         print(f'{percentage}% percentile latency: {percentile} seconds')
 
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description='Benchmark the latency of processing a single batch of '
         'requests till completion.')
     parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--speculative-model', type=str, default=None)
+    parser.add_argument('--num-speculative-tokens', type=int, default=None)
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
@@ -144,8 +160,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
         help=
         'Data type for kv cache storage. If "auto", will use model data type. '
         'FP8_E5M2 (without scaling) is only supported on cuda version greater '
-        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
-        'common inference criteria.')
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
     parser.add_argument(
         '--quantization-param-path',
         type=str,
@@ -181,6 +197,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
         action='store_true',
         help='If True, the prefill requests can be chunked based on the '
         'max_num_batched_tokens')
+    parser.add_argument('--use-v2-block-manager', action='store_true')
     parser.add_argument(
         "--ray-workers-use-nsight",
         action='store_true',
@@ -191,5 +208,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
         default=None,
         help='directory to download and load the weights, '
         'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
     args = parser.parse_args()
     main(args)
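Note: with the new flag, latency results become machine-readable. The keys (avg_latency, latencies, percentiles) come straight from the diff above, and the file name matches the one passed in .buildkite/run-benchmarks.sh. A small sketch of a downstream consumer:

import json

# Produced by:
#   python3 benchmarks/benchmark_latency.py --output-json latency_results.json
with open("latency_results.json") as f:
    results = json.load(f)

print(f"avg latency: {results['avg_latency']:.4f} s")
for pct, value in results["percentiles"].items():
    print(f"p{pct}: {value:.4f} s")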

benchmarks/benchmark_throughput.py (17 additions, 0 deletions)

@@ -242,6 +242,18 @@ def main(args: argparse.Namespace):
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} tokens/s")
 
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Benchmark the throughput.")
@@ -353,6 +365,11 @@ def main(args: argparse.Namespace):
         default=None,
         help='directory to download and load the weights, '
         'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
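Note: the throughput JSON mirrors the printed summary, with keys elapsed_time, num_requests, total_num_tokens, requests_per_second, and tokens_per_second taken from the diff above; the file name matches the CI script. A sketch of reading it back:

import json

# Produced by:
#   python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json
with open("throughput_results.json") as f:
    results = json.load(f)

print(f"{results['requests_per_second']:.2f} requests/s, "
      f"{results['tokens_per_second']:.2f} tokens/s "
      f"({results['num_requests']} requests in {results['elapsed_time']:.1f} s)")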
