README.md: 25 additions & 0 deletions
```diff
@@ -87,6 +87,31 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
 We welcome and value any contributions and collaborations.
 Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
 
+## Sponsors
+
+vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+
+<!-- Note: Please sort them in alphabetical order. -->
+<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
+
+- a16z
+- AMD
+- Anyscale
+- AWS
+- Crusoe Cloud
+- Databricks
+- DeepInfra
+- Lambda Lab
+- NVIDIA
+- Replicate
+- Roblox
+- RunPod
+- Trainy
+- UC Berkeley
+- UC San Diego
+
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+
 ## Citation
 
 If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
```
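The hunk ends at the line introducing the README's citation block, which this diff does not include. For reference, a plausible BibTeX entry for the linked paper (arXiv:2309.06180, the PagedAttention paper) is sketched below; the venue and author fields are taken from the public listing and should be checked against the README's actual entry.

```bibtex
@inproceedings{kwon2023efficient,
  title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
  author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
  booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
  year={2023}
}
```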
csrc/attention/attention_generic.cuh: 10 additions & 9 deletions
```diff
@@ -1,5 +1,6 @@
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
```