Commit 4af8f5d

Merge branch 'main_20240517' into main
2 parents: a1d0cd0 + c5711ef

84 files changed (+5814, −476 lines)


.buildkite/run-amd-test.sh (6 additions, 5 deletions)

@@ -1,4 +1,4 @@
-# This script build the ROCm docker image and runs test inside it.
+# This script runs test inside the corresponding ROCm docker container.
 set -ex
 
 # Print ROCm version
@@ -19,15 +19,16 @@ done
 
 echo "--- Building container"
 sha=$(git rev-parse --short HEAD)
-container_name=rocm_${sha}
+image_name=rocm_${sha}
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
 docker build \
-        -t ${container_name} \
+        -t ${image_name} \
         -f Dockerfile.rocm \
         --progress plain \
         .
 
 remove_docker_container() {
-  docker rm -f ${container_name} || docker image rm -f ${container_name} || true
+  docker rm -f ${container_name} || docker image rm -f ${image_name} || true
 }
 trap remove_docker_container EXIT
 
@@ -39,6 +40,6 @@ docker run \
        --rm \
        -e HF_TOKEN \
        --name ${container_name} \
-       ${container_name} \
+       ${image_name} \
        /bin/bash -c "${@}"
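Note: the commit splits the previously shared name into image_name (stable per commit SHA, so the image tag is reused) and container_name (random per run, so concurrent CI jobs on one host cannot collide). A minimal Python sketch of the same naming scheme; the sha argument stands in for git rev-parse --short HEAD, and the function name is illustrative only:

import secrets
import string

def unique_container_name(sha: str, suffix_len: int = 10) -> str:
    # Same idea as the shell pipeline above:
    #   tr -dc A-Za-z0-9 < /dev/urandom | head -c 10
    alphabet = string.ascii_letters + string.digits
    suffix = "".join(secrets.choice(alphabet) for _ in range(suffix_len))
    return f"rocm_{sha}_{suffix}"

# e.g. unique_container_name("4af8f5d") -> "rocm_4af8f5d_kQ3xZ9bT1c" (suffix varies per call)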

.buildkite/run-benchmarks.sh (4 additions, 3 deletions)

@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
 (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
 # run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
 # run server-based benchmarks and upload the result to buildkite
@@ -74,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
     exit $bench_serving_exit_code
 fi
 
-/workspace/buildkite-agent artifact upload openai-*.json
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+/workspace/buildkite-agent artifact upload "*.json"

.buildkite/test-pipeline.yaml (16 additions, 3 deletions)

@@ -5,13 +5,16 @@
 
 steps:
 - label: Regression Test
+  mirror_hardwares: [amd]
   command: pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
 
 - label: AsyncEngine Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s async_engine
 
 - label: Basic Correctness Test
+  mirror_hardwares: [amd]
   commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
@@ -24,14 +27,15 @@ steps:
   command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s distributed/test_comm_ops.py
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
 
 - label: Distributed Tests
+  mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
-  mirror_hardwares: [amd]
   commands:
   - pytest -v -s distributed/test_pynccl_library.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -42,19 +46,22 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py
 
 - label: Distributed Tests (Multiple Groups)
+  #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
   - pytest -v -s distributed/test_pynccl.py
 
 - label: Engine Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
 
 - label: Entrypoints Test
   mirror_hardwares: [amd]
+
   commands:
   # these tests have to be separated, because each one will allocate all posible GPU memory
   - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
@@ -74,6 +81,7 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 
 - label: Kernels Test %N
+  #mirror_hardwares: [amd]
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
@@ -84,7 +92,7 @@ steps:
   - pytest -v -s models --ignore=models/test_llava.py
 
 - label: Llava Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amd]
   commands:
   - bash ../.buildkite/download-images.sh
   - pytest -v -s models/test_llava.py
@@ -95,6 +103,7 @@ steps:
   - pytest -v -s prefix_caching
 
 - label: Samplers Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s samplers
 
 - label: LogitsProcessor Test
@@ -110,16 +119,20 @@ steps:
   command: pytest -v -s spec_decode
 
 - label: LoRA Test %N
+  #mirror_hardwares: [amd]
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
   parallelism: 4
 
 - label: Tensorizer Test
+  #mirror_hardwares: [amd]
   command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
 
 - label: Metrics Test
+  mirror_hardwares: [amd]
  command: pytest -v -s metrics
 
 - label: Quantization Test
+  #mirror_hardwares: [amd]
   command: pytest -v -s quantization
 
 - label: Benchmarks
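Note: mirror_hardwares: [amd] marks a step that should also be scheduled on AMD hardware; the commented-out entries record steps not yet mirrored. A hedged sketch of listing the mirrored steps, assuming PyYAML is available and the script runs from the repository root:

import yaml  # assumed dependency: PyYAML

with open(".buildkite/test-pipeline.yaml") as f:
    pipeline = yaml.safe_load(f)

# Commented-out "#mirror_hardwares" lines are YAML comments, so they
# simply do not appear in the parsed steps.
amd_steps = [
    step["label"]
    for step in pipeline.get("steps", [])
    if "amd" in step.get("mirror_hardwares", [])
]
print("\n".join(amd_steps))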

.buildkite/test-template.j2 (1 addition, 2 deletions)

@@ -3,9 +3,8 @@
 {% set default_working_dir = "/vllm-workspace/tests" %}
 
 steps:
-
   - label: ":docker: build image"
-  commands:
+    commands:
     - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
     - "docker push {{ docker_image }}"
     env:

CMakeLists.txt (28 additions, 2 deletions)

@@ -173,13 +173,38 @@ set(VLLM_EXT_SRC
   "csrc/pybind.cpp")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
+  include(FetchContent)
+  SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
+  FetchContent_Declare(
+        cutlass
+        GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+        # CUTLASS 3.5.0
+        GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+  )
+  FetchContent_MakeAvailable(cutlass)
+
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/quantization/marlin/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+    "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
-    "csrc/custom_all_reduce.cu")
+    "csrc/custom_all_reduce.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+
+  #
+  # The CUTLASS kernels for Hopper require sm90a to be enabled.
+  # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+  # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+  set_source_files_properties(
+        "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+        PROPERTIES
+        COMPILE_FLAGS
+        "-gencode arch=compute_90a,code=sm_90a")
+
 endif()
 
 define_gpu_extension_target(
@@ -189,6 +214,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   WITH_SOABI)
 
 #

README.md (11 additions, 1 deletion)

@@ -14,8 +14,18 @@ Easy, fast, and cheap LLM serving for everyone
 
 </p>
 
+---
+
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
+---
+
 *Latest News* 🔥
-- [2024/05] We are hosting [the fourth vLLM meetup](https://lu.ma/event/manage/evt-A064fGpj52fviSn) with BentoML and Cloudflare on June 11! Please register [here](https://lu.ma/agivllm).
 - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
 - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
 - [2024/01] Added ROCm 6.0 support to vLLM.

benchmarks/benchmark_latency.py (24 additions, 2 deletions)

@@ -1,5 +1,6 @@
 """Benchmark the latency of processing a single batch of requests."""
 import argparse
+import json
 import time
 from pathlib import Path
 from typing import Optional
@@ -18,6 +19,8 @@ def main(args: argparse.Namespace):
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(model=args.model,
+              speculative_model=args.speculative_model,
+              num_speculative_tokens=args.num_speculative_tokens,
               tokenizer=args.tokenizer,
               quantization=args.quantization,
               tensor_parallel_size=args.tensor_parallel_size,
@@ -28,6 +31,7 @@
               quantization_param_path=args.quantization_param_path,
               device=args.device,
               ray_workers_use_nsight=args.ray_workers_use_nsight,
+              use_v2_block_manager=args.use_v2_block_manager,
               enable_chunked_prefill=args.enable_chunked_prefill,
               download_dir=args.download_dir,
               block_size=args.block_size)
@@ -93,12 +97,24 @@ def run_to_completion(profile_dir: Optional[str] = None):
     for percentage, percentile in zip(percentages, percentiles):
         print(f'{percentage}% percentile latency: {percentile} seconds')
 
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description='Benchmark the latency of processing a single batch of '
         'requests till completion.')
     parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--speculative-model', type=str, default=None)
+    parser.add_argument('--num-speculative-tokens', type=int, default=None)
     parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
@@ -144,8 +160,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
         help=
         'Data type for kv cache storage. If "auto", will use model data type. '
         'FP8_E5M2 (without scaling) is only supported on cuda version greater '
-        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
-        'common inference criteria.')
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
     parser.add_argument(
         '--quantization-param-path',
         type=str,
@@ -181,6 +197,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
         action='store_true',
         help='If True, the prefill requests can be chunked based on the '
         'max_num_batched_tokens')
+    parser.add_argument('--use-v2-block-manager', action='store_true')
     parser.add_argument(
         "--ray-workers-use-nsight",
         action='store_true',
@@ -191,5 +208,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
         default=None,
         help='directory to download and load the weights, '
         'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
     args = parser.parse_args()
     main(args)
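Note: with the new flag, latency results become machine-readable. The keys (avg_latency, latencies, percentiles) come straight from the diff above, and the file name matches the one passed in .buildkite/run-benchmarks.sh. A small sketch of a downstream consumer:

import json

# Produced by:
#   python3 benchmarks/benchmark_latency.py --output-json latency_results.json
with open("latency_results.json") as f:
    results = json.load(f)

print(f"avg latency: {results['avg_latency']:.4f} s")
for pct, value in results["percentiles"].items():
    print(f"p{pct}: {value:.4f} s")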

benchmarks/benchmark_throughput.py (17 additions, 0 deletions)

@@ -242,6 +242,18 @@ def main(args: argparse.Namespace):
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} tokens/s")
 
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Benchmark the throughput.")
@@ -353,6 +365,11 @@ def main(args: argparse.Namespace):
         default=None,
         help='directory to download and load the weights, '
         'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
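Note: the throughput JSON mirrors the printed summary, with keys elapsed_time, num_requests, total_num_tokens, requests_per_second, and tokens_per_second taken from the diff above; the file name matches the CI script. A sketch of reading it back:

import json

# Produced by:
#   python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json
with open("throughput_results.json") as f:
    results = json.load(f)

print(f"{results['requests_per_second']:.2f} requests/s, "
      f"{results['tokens_per_second']:.2f} tokens/s "
      f"({results['num_requests']} requests in {results['elapsed_time']:.1f} s)")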
