Skip to content

Commit 361c63d

Browse files
authored
Merge pull request #297 from ROCm/upstream_merge_24_11_25
Upstream merge 24/11/25 and 24/12/2
2 parents 6cf8eb4 + a8b5334 commit 361c63d

File tree

456 files changed

+22072
-8257
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

456 files changed

+22072
-8257
lines changed

.buildkite/nightly-benchmarks/benchmark-pipeline.yaml

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@ steps:
99
- image: badouralix/curl-jq
1010
command:
1111
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
12+
1213
- wait
14+
1315
- label: "A100"
16+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
1417
agents:
1518
queue: A100
1619
plugins:
@@ -41,20 +44,43 @@ steps:
4144
- name: devshm
4245
emptyDir:
4346
medium: Memory
44-
# - label: "H100"
45-
# agents:
46-
# queue: H100
47-
# plugins:
48-
# - docker#v5.11.0:
49-
# image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
50-
# command:
51-
# - bash
52-
# - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
53-
# mount-buildkite-agent: true
54-
# propagate-environment: true
55-
# ipc: host
56-
# gpus: all
57-
# environment:
58-
# - VLLM_USAGE_SOURCE
59-
# - HF_TOKEN
6047

48+
- label: "H200"
49+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
50+
agents:
51+
queue: H200
52+
plugins:
53+
- docker#v5.12.0:
54+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
55+
command:
56+
- bash
57+
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
58+
mount-buildkite-agent: true
59+
propagate-environment: true
60+
ipc: host
61+
gpus: 4,5,6,7
62+
volumes:
63+
- /data/benchmark-hf-cache:/root/.cache/huggingface
64+
environment:
65+
- VLLM_USAGE_SOURCE
66+
- HF_TOKEN
67+
68+
- label: "H100"
69+
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
70+
agents:
71+
queue: H100
72+
plugins:
73+
- docker#v5.12.0:
74+
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
75+
command:
76+
- bash
77+
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
78+
mount-buildkite-agent: true
79+
propagate-environment: true
80+
ipc: host
81+
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
82+
volumes:
83+
- /data/benchmark-hf-cache:/root/.cache/huggingface
84+
environment:
85+
- VLLM_USAGE_SOURCE
86+
- HF_TOKEN

.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,18 @@ def results_to_json(latency, throughput, serving):
157157
throughput_results,
158158
serving_results)
159159

160+
for df in [latency_results, serving_results, throughput_results]:
161+
if df.empty:
162+
continue
163+
164+
# Sort all dataframes by their respective "Test name" columns
165+
df.sort_values(by="Test name", inplace=True)
166+
167+
# The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
168+
# we want to turn it into "8xGPUTYPE"
169+
df["GPU"] = df["GPU"].apply(
170+
lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
171+
160172
# get markdown tables
161173
latency_md_table = tabulate(latency_results,
162174
headers='keys',

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
# Do not set -e, as the mixtral 8x22B model tends to crash occasionally
88
# and we still want to see other benchmarking results even when mixtral crashes.
9+
set -x
910
set -o pipefail
1011

1112
check_gpus() {
@@ -85,11 +86,7 @@ kill_gpu_processes() {
8586

8687
ps -aux
8788
lsof -t -i:8000 | xargs -r kill -9
88-
pkill -f pt_main_thread
89-
# this line doesn't work now
90-
# ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
91-
pkill -f python3
92-
pkill -f /usr/bin/python3
89+
pgrep python3 | xargs -r kill -9
9390

9491

9592
# wait until GPU memory usage smaller than 1GB
@@ -289,7 +286,7 @@ run_serving_tests() {
289286
# run the server
290287
echo "Running test case $test_name"
291288
echo "Server command: $server_command"
292-
eval "$server_command" &
289+
bash -c "$server_command" &
293290
server_pid=$!
294291

295292
# wait until the server is alive
@@ -322,7 +319,7 @@ run_serving_tests() {
322319
echo "Running test case $test_name with qps $qps"
323320
echo "Client command: $client_command"
324321

325-
eval "$client_command"
322+
bash -c "$client_command"
326323

327324
# record the benchmarking commands
328325
jq_output=$(jq -n \

.buildkite/run-amd-test.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
8585
--ignore=kernels/test_encoder_decoder_attn.py \
8686
--ignore=kernels/test_flash_attn.py \
8787
--ignore=kernels/test_flashinfer.py \
88-
--ignore=kernels/test_gguf.py \
8988
--ignore=kernels/test_int8_quant.py \
9089
--ignore=kernels/test_machete_gemm.py \
9190
--ignore=kernels/test_mamba_ssm.py \

.buildkite/run-cpu-test-ppc64le.sh

Lines changed: 3 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -4,49 +4,11 @@
44
# It serves a sanity check for compilation and basic model usage.
55
set -ex
66

7-
# Try building the docker image
8-
docker build -t cpu-test -f Dockerfile.ppc64le .
9-
107
# Setup cleanup
11-
remove_docker_container() { docker rm -f cpu-test || true; }
8+
remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
129
trap remove_docker_container EXIT
1310
remove_docker_container
1411

15-
# Run the image, setting --shm-size=4g for tensor parallel.
16-
source /etc/environment
17-
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
18-
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
19-
20-
function cpu_tests() {
21-
set -e
22-
23-
# Run basic model test
24-
docker exec cpu-test bash -c "
25-
set -e
26-
pip install pytest pytest-asyncio \
27-
decord einops librosa peft Pillow sentence-transformers soundfile \
28-
transformers_stream_generator matplotlib datamodel_code_generator
29-
pip install torchvision --index-url https://download.pytorch.org/whl/cpu
30-
pytest -v -s tests/models/decoder_only/language -m cpu_model
31-
pytest -v -s tests/models/embedding/language -m cpu_model
32-
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
33-
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
34-
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
35-
36-
# online inference
37-
docker exec cpu-test bash -c "
38-
set -e
39-
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
40-
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
41-
python3 benchmarks/benchmark_serving.py \
42-
--backend vllm \
43-
--dataset-name random \
44-
--model facebook/opt-125m \
45-
--num-prompts 20 \
46-
--endpoint /v1/completions \
47-
--tokenizer facebook/opt-125m"
48-
}
12+
# Try building the docker image
13+
docker build -t cpu-test -f Dockerfile.ppc64le .
4914

50-
# All of CPU tests are expected to be finished less than 25 mins.
51-
export -f cpu_tests
52-
timeout 25m bash -c "cpu_tests"

.buildkite/run-cpu-test.sh

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,26 +13,27 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.
1313
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
1414

1515
# Setup cleanup
16-
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
16+
remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
1717
trap remove_docker_container EXIT
1818
remove_docker_container
1919

2020
# Run the image, setting --shm-size=4g for tensor parallel.
2121
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
22-
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
22+
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
2323
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
24-
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
24+
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
2525

2626
function cpu_tests() {
2727
set -e
28+
export NUMA_NODE=$2
2829

2930
# offline inference
30-
docker exec cpu-test-avx2 bash -c "
31+
docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
3132
set -e
3233
python3 examples/offline_inference.py"
3334

3435
# Run basic model test
35-
docker exec cpu-test bash -c "
36+
docker exec cpu-test-"$NUMA_NODE" bash -c "
3637
set -e
3738
pip install pytest pytest-asyncio \
3839
decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -45,20 +46,26 @@ function cpu_tests() {
4546
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
4647

4748
# Run compressed-tensor test
48-
docker exec cpu-test bash -c "
49+
docker exec cpu-test-"$NUMA_NODE" bash -c "
4950
set -e
5051
pytest -s -v \
5152
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
5253
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
5354

5455
# Run AWQ test
55-
docker exec cpu-test bash -c "
56+
docker exec cpu-test-"$NUMA_NODE" bash -c "
5657
set -e
5758
pytest -s -v \
5859
tests/quantization/test_ipex_quant.py"
5960

61+
# Run chunked-prefill and prefix-cache test
62+
docker exec cpu-test-"$NUMA_NODE" bash -c "
63+
set -e
64+
pytest -s -v -k cpu_model \
65+
tests/basic_correctness/test_chunked_prefill.py"
66+
6067
# online inference
61-
docker exec cpu-test bash -c "
68+
docker exec cpu-test-"$NUMA_NODE" bash -c "
6269
set -e
6370
export VLLM_CPU_KVCACHE_SPACE=10
6471
export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -75,4 +82,4 @@ function cpu_tests() {
7582

7683
# All of CPU tests are expected to be finished less than 25 mins.
7784
export -f cpu_tests
78-
timeout 25m bash -c "cpu_tests $CORE_RANGE"
85+
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

0 commit comments

Comments (0)