
Commit 7dcb71d

Merge remote-tracking branch 'upstream/main'
2 parents: acbf909 + 65ae8c2


609 files changed: +82,783 / −13,428 lines


.buildkite/check-wheel-size.py

Lines changed: 36 additions & 0 deletions
import os
import zipfile

MAX_SIZE_MB = 150


def print_top_10_largest_files(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


def check_wheel_size(directory):
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                wheel_path = os.path.join(root, f)
                wheel_size = os.path.getsize(wheel_path)
                wheel_size_mb = wheel_size / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(
                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                        f"compared to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb} MB).")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(check_wheel_size(sys.argv[1]))
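A hypothetical local invocation (the dist/ path is an assumption; the script walks any directory containing built wheels):

# Sketch: check wheels under dist/ against the 150 MB limit; exits 1 on violation
python3 .buildkite/check-wheel-size.py dist/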

.buildkite/download-images.sh

Lines changed: 18 additions & 0 deletions
#!/bin/bash

set -ex
set -o pipefail

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

cd -
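A quick sanity check after running the script (a sketch; the file names come from the wget calls above):

# Sketch: fetch the test assets, then confirm a sample of them landed in images/
bash .buildkite/download-images.sh
ls -lh images/stop_sign.jpg images/cherry_blossom_pixel_values.pt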

.buildkite/run-amd-test.sh

Lines changed: 45 additions & 0 deletions
# This script runs tests inside the corresponding ROCm docker container.
set -ex

# Print ROCm version
echo "--- ROCm info"
rocminfo

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state

while true; do
    sleep 3
    if grep -q clean /opt/amdgpu/etc/gpu_state; then
        echo "GPUs state is \"clean\""
        break
    fi
done

echo "--- Building container"
sha=$(git rev-parse --short HEAD)
image_name=rocm_${sha}
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
docker build \
    -t ${image_name} \
    -f Dockerfile.rocm \
    --progress plain \
    .

remove_docker_container() {
    docker rm -f ${container_name} || docker image rm -f ${image_name} || true
}
trap remove_docker_container EXIT

echo "--- Running container"

docker run \
    --device /dev/kfd --device /dev/dri \
    --network host \
    --rm \
    -e HF_TOKEN \
    --name ${container_name} \
    ${image_name} \
    /bin/bash -c "${@}"
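Since the container entrypoint is /bin/bash -c "${@}", the test command is supplied as the script's argument. A hypothetical invocation (the pytest target is an assumption, not part of this commit):

# Sketch: build the ROCm image at HEAD and run one test command inside it
bash .buildkite/run-amd-test.sh "python3 -m pytest -v tests/"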

.buildkite/run-benchmarks.sh

Lines changed: 22 additions & 7 deletions
@@ -6,27 +6,32 @@ set -o pipefail
 # cd into parent directory of this file
 cd "$(dirname "${BASH_SOURCE[0]}")/.."
 
-(wget && curl) || (apt-get update && apt-get install -y wget curl)
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
 
-# run benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+# run python-based benchmarks and upload the result to buildkite
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
 bench_latency_exit_code=$?
 
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
 bench_throughput_exit_code=$?
 
+# run server-based benchmarks and upload the result to buildkite
 python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
 # wait for server to start, timeout after 600 seconds
 timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
 python3 benchmarks/benchmark_serving.py \
-    --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
+    --backend vllm \
+    --dataset-name sharegpt \
+    --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
     --model meta-llama/Llama-2-7b-chat-hf \
     --num-prompts 20 \
     --endpoint /v1/completions \
-    --tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt
+    --tokenizer meta-llama/Llama-2-7b-chat-hf \
+    --save-result \
+    2>&1 | tee benchmark_serving.txt
 bench_serving_exit_code=$?
 kill $server_pid
 
@@ -44,7 +49,14 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
 echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
-tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines
+echo '```' >> benchmark_results.md
+tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+echo '```' >> benchmark_results.md
+
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /workspace/buildkite-agent ]; then
+    exit 0
+fi
 
 # upload the results to buildkite
 /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -61,3 +73,6 @@ fi
 if [ $bench_serving_exit_code -ne 0 ]; then
     exit $bench_serving_exit_code
 fi
+
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+/workspace/buildkite-agent artifact upload "*.json"
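With --save-result, benchmark_serving.py writes its metrics to a JSON file that the final artifact upload "*.json" step collects alongside latency_results.json and throughput_results.json. A sketch of inspecting the saved files locally (the serving result's exact file name is generated by the benchmark script, so the glob is an assumption):

# Sketch: pretty-print the head of each saved benchmark result file
for f in *.json; do
    echo "== $f =="
    python3 -m json.tool "$f" | head -n 20
done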

.buildkite/run-cpu-test.sh

Lines changed: 14 additions & 0 deletions
# This script builds the CPU docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
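VLLM_CPU_KVCACHE_SPACE caps the KV-cache size in GiB for vLLM's CPU backend. A sketch of re-running the same check with a larger cache (the value 4 is an arbitrary assumption, not part of this commit):

# Sketch: same sanity check, but with 4 GiB of CPU KV-cache instead of 1
docker run --network host --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test \
    python3 examples/offline_inference.py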

.buildkite/run-neuron-test.sh

Lines changed: 51 additions & 0 deletions
# This script builds the Neuron docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old images and containers to save disk space, and only once a day,
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
    last_build=$(cat /tmp/neuron-docker-build-timestamp)
    current_time=$(date +%s)
    if [ $((current_time - last_build)) -gt 86400 ]; then
        docker system prune -f
        echo $current_time > /tmp/neuron-docker-build-timestamp
    fi
else
    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a"}'
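The /generate endpoint of vllm.entrypoints.api_server also accepts sampling parameters in the request body; a hedged variant (the max_tokens and temperature fields mirror vLLM's SamplingParams and are assumptions here, not part of this commit):

# Sketch: same smoke test, but with explicit, deterministic sampling settings
curl -X POST -H "Content-Type: application/json" \
    localhost:8000/generate \
    -d '{"prompt": "San Francisco is a", "max_tokens": 32, "temperature": 0.0}'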
