From 9c2367cefc4b497d67d76761854cb19fb7c2d4f7 Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Fri, 26 Jan 2024 23:40:51 -0500 Subject: [PATCH 001/159] [ROCm] Fixup arch checks for ROCM The ROCM stack with PyTorch supports a wide set of gfx architectures. This can be displayed by printing PYTORCH_ROCM_ARCH env. In the absence of PYTORCH_ROCM_ARCH pytorch uses the output from rocm_agent_enumerator to choose what to compile for. vllm supports a subset of these, (gfx908, gfx90a,...) Due to a need to potentially support multiple architectures at once (ex. docker image) it's important to make sure vllm is compiled with them all unless specified otherwise. We now gather either the PYTORCH_ROCM_ARCH env or rocm_agent_enumerator output and cross reference with ROCM_SUPPORTED_ARCHS from vllm to generate a list of arches to build for. --- Dockerfile.rocm | 3 -- setup.py | 83 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 55 insertions(+), 31 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 88172fb73b937..3c76305303037 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -10,9 +10,6 @@ RUN echo "Base image is $BASE_IMAGE" # BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" # BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" -# this does not always work for all rocm versions -RUN LLVM_GFX_ARCH=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) && \ - echo "LLVM_GFX_ARCH is $LLVM_GFX_ARCH" ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" diff --git a/setup.py b/setup.py index 88fa495205659..25b460fdc6cc4 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ # Supported NVIDIA GPU architectures. 
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) @@ -63,21 +63,6 @@ def _is_cuda() -> bool: NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -def get_amdgpu_offload_arch(): - command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" - try: - output = subprocess.check_output([command]) - return output.decode('utf-8').strip() - except subprocess.CalledProcessError as e: - error_message = f"Error: {e}" - raise RuntimeError(error_message) from e - except FileNotFoundError as e: - # If the command is not found, print an error message - error_message = f"The command {command} was not found." - raise RuntimeError(error_message) from e - - return None - def get_hipcc_rocm_version(): # Run the hipcc --version command @@ -138,6 +123,49 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: return nvcc_cuda_version +def get_pytorch_rocm_arch() -> Set[str]: + """Get the cross section of Pytorch,and vllm supported gfx arches + + ROCM can get the supported gfx architectures in one of two ways + Either through the PYTORCH_ROCM_ARCH env var, or output from + rocm_agent_enumerator. + + In either case we can generate a list of supported arch's and + cross reference with VLLM's own ROCM_SUPPORTED_ARCHs. + """ + env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None) + + # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator + if env_arch_list is None: + command = "rocm_agent_enumerator" + env_arch_list = subprocess.check_output([command]).decode('utf-8')\ + .strip().replace("\n", ";") + arch_source_str = "rocm_agent_enumerator" + else: + arch_source_str = "PYTORCH_ROCM_ARCH env variable" + + # List are separated by ; or space. 
+ pytorch_rocm_arch = set(env_arch_list.replace(" ", ";").split(";")) + + # Filter out the invalid architectures and print a warning. + arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS) + + # If none of the specified architectures are valid, raise an error. + if not arch_list: + raise RuntimeError( + f"None of the ROCM architectures in {arch_source_str} " + f"({env_arch_list}) is supported. " + f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.") + invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS + if invalid_arch_list: + warnings.warn( + f"Unsupported ROCM architectures ({invalid_arch_list}) are " + f"excluded from the {arch_source_str} output " + f"({env_arch_list}). Supported ROCM architectures are: " + f"{ROCM_SUPPORTED_ARCHS}.", + stacklevel=2) + return arch_list + def get_torch_arch_list() -> Set[str]: # TORCH_CUDA_ARCH_LIST can have one or more architectures, # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the @@ -162,22 +190,27 @@ def get_torch_arch_list() -> Set[str]: # If none of the specified architectures are valid, raise an error. if not arch_list: raise RuntimeError( - "None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " + "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env " f"variable ({env_arch_list}) is supported. " - f"Supported CUDA/ROCM architectures are: {valid_archs}.") + f"Supported CUDA architectures are: {valid_archs}.") invalid_arch_list = torch_arch_list - valid_archs if invalid_arch_list: warnings.warn( - f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " + f"Unsupported CUDA architectures ({invalid_arch_list}) are " "excluded from the `TORCH_CUDA_ARCH_LIST` env variable " - f"({env_arch_list}). Supported CUDA/ROCM architectures are: " + f"({env_arch_list}). Supported CUDA architectures are: " f"{valid_archs}.", stacklevel=2) return arch_list -# First, check the TORCH_CUDA_ARCH_LIST environment variable. 
-compute_capabilities = get_torch_arch_list() +if _is_hip(): + rocm_arches = get_pytorch_rocm_arch() + NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches] +else: + # First, check the TORCH_CUDA_ARCH_LIST environment variable. + compute_capabilities = get_torch_arch_list() + if _is_cuda() and not compute_capabilities: # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available # GPUs on the current machine. @@ -283,12 +316,6 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) -elif _is_hip(): - amd_arch = get_amdgpu_offload_arch() - if amd_arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {amd_arch}") elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() From a9d752c7be6686000ace68fe2bc49e3845fc7e8e Mon Sep 17 00:00:00 2001 From: Douglas Lehr Date: Sat, 27 Jan 2024 12:44:19 -0500 Subject: [PATCH 002/159] yapf cleanup --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 25b460fdc6cc4..15b9a78f6ca27 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,9 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = { + "gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100" +} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) @@ -63,7 +65,6 @@ def _is_cuda() -> bool: NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -166,6 +167,7 @@ def get_pytorch_rocm_arch() -> Set[str]: stacklevel=2) return arch_list + def get_torch_arch_list() -> Set[str]: # TORCH_CUDA_ARCH_LIST can have one or more architectures, # e.g. "8.0" or "7.5,8.0,8.6+PTX". 
Here, the "8.6+PTX" option asks the From ad53f74b6d05cb3b1b97f66e490e99ac745fde31 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Mon, 5 Feb 2024 15:47:56 +0000 Subject: [PATCH 003/159] Add hip_fp8 datatype and conversions Add non-MI300 compatible alternative for bulk conversions Removed bf8 (e5m2) and renamed f8 to fp8 to explicitly specify that it is e4m3 Removed stochastic rounding for simplicity Put bulk fp8 conversion hip intrinsics behind a define. Disabled by default Using types from the proper vllm headers. Added namespace Move amd specific headers under amd_detail --- csrc/quantization/fp8/amd_detail/hip_float8.h | 167 +++++++++ .../fp8/amd_detail/hip_float8_impl.h | 316 ++++++++++++++++++ .../fp8/amd_detail/quant_utils.cuh | 293 ++++++++++++++++ 3 files changed, 776 insertions(+) create mode 100644 csrc/quantization/fp8/amd_detail/hip_float8.h create mode 100644 csrc/quantization/fp8/amd_detail/hip_float8_impl.h create mode 100644 csrc/quantization/fp8/amd_detail/quant_utils.cuh diff --git a/csrc/quantization/fp8/amd_detail/hip_float8.h b/csrc/quantization/fp8/amd_detail/hip_float8.h new file mode 100644 index 0000000000000..87c7c9ce66100 --- /dev/null +++ b/csrc/quantization/fp8/amd_detail/hip_float8.h @@ -0,0 +1,167 @@ +#pragma once + +#ifdef __HIPCC__ +#include +#else +#include +#include +#include +#include +#endif + +#include "hip_float8_impl.h" + +struct alignas(1) hip_fp8 +{ + struct from_bits_t + { + }; + HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() { return from_bits_t(); } + uint8_t data; + + hip_fp8() = default; + HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default; + HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete; + explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t) + : data(v) + { + } + +#ifdef __HIP__MI300__ + // NOTE: ON-DEVICE... 
always optimal bias + explicit HIP_FP8_DEVICE hip_fp8(float v) + : data(hip_fp8_impl::to_fp8_from_fp32(v)) + { + } + + explicit HIP_FP8_DEVICE hip_fp8(_Float16 v) + : hip_fp8(static_cast(v)) + { + } + + // Host only implementation using s/w simulation + explicit HIP_FP8_HOST +#else // __HIP__MI300__ + // both Host and DEVICE for non-MI300 using s/w simulation + explicit HIP_FP8_HOST_DEVICE +#endif // __HIP__MI300__ + hip_fp8(float v) + { + data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/, true /*clip*/>(v); + } + + explicit HIP_FP8_HOST_DEVICE hip_fp8(double v) + : hip_fp8(static_cast(v)) + { + } + +#ifdef __HIP__MI300__ + // upcast using device specific intrinsic + explicit inline HIP_FP8_DEVICE operator float() const + { + float fval; + uint32_t i32val = static_cast(data); + + // upcast + asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val)); + + return fval; + } + + explicit inline HIP_FP8_HOST operator float() const +#else // __HIP__MI300__ + explicit inline HIP_FP8_HOST_DEVICE operator float() const +#endif // __HIP__MI300__ + { + return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>(data); + } +}; + +namespace std +{ +inline hip_fp8 sin(hip_fp8 a) +{ + return hip_fp8(sinf(float(a))); +} +inline hip_fp8 cos(hip_fp8 a) +{ + return hip_fp8(cosf(float(a))); +} +HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a) +{ + return a; +} +} // namespace std + +// Special operator overloading +inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8) +{ + return os << float(f8); +} + +// all + operator overloading with mixed types +// mixed types, always converts to f32, does computation in f32, and returns float +inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b) +{ + return (fa + float(b)); +} + +inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb) +{ + return (float(a) + fb); +} + +inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b) 
+{ + return hip_fp8(float(a) + float(b)); +} + +inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b) +{ + return a = hip_fp8(float(a) + float(b)); +} + +// overloading multiplication, always returns float, +inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b) +{ + return float(a) * float(b); +} + +inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b) +{ + return (a * float(b)); +} + +inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b) +{ + return (float(a) * b); +} + +inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b) +{ + return ((float)a * float(b)); +} + +inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b) +{ + return ((float)a * float(b)); +} + +// overloading for compare +inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b) +{ + return (a.data == b.data); +} +inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b) +{ + return (a.data != b.data); +} + +inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b) +{ + return static_cast(a) >= static_cast(b); +} +inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b) +{ + return static_cast(a) > static_cast(b); +} diff --git a/csrc/quantization/fp8/amd_detail/hip_float8_impl.h b/csrc/quantization/fp8/amd_detail/hip_float8_impl.h new file mode 100644 index 0000000000000..c88fbd913c2ee --- /dev/null +++ b/csrc/quantization/fp8/amd_detail/hip_float8_impl.h @@ -0,0 +1,316 @@ +#pragma once + +#if defined(__HIPCC__) && (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#define __HIP__MI300__ +#endif + +#ifdef __HIPCC__ +#define HIP_FP8_HOST_DEVICE __host__ __device__ +#define HIP_FP8_HOST __host__ +#define HIP_FP8_DEVICE __device__ +#else +#define HIP_FP8_HOST_DEVICE +#define HIP_FP8_HOST +#define HIP_FP8_DEVICE +#endif + +namespace hip_fp8_impl +{ + +#ifdef __HIP__MI300__ +HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v) +{ + uint8_t i8data; + union { + float fval; + uint32_t i32val; + uint8_t 
i8val[4]; // NOTE: not endian independent + } val; + + uint32_t ival = 0; + val.fval = v; + + if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + + ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, + false); // false -> WORD0 + val.i32val = ival; + i8data = val.i8val[0]; + + return i8data; +} +#endif // __HIP__MI300__ + +HIP_FP8_HOST inline int clz(uint32_t x) +{ + return __builtin_clz(x); +} +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +HIP_FP8_DEVICE inline int clz(uint32_t x) +{ + return __clz(x); +} +#endif + +template +HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false, uint32_t rng = 0) +{ +#ifdef __HIPCC__ + constexpr bool is_half = std::is_same::value; +#else + constexpr bool is_half = false; +#endif + constexpr bool is_float = std::is_same::value; + static_assert(wm + we == 7, "wm+we==7"); + static_assert(is_half || is_float, "Only half and float can be cast to f8"); + + const int mfmt = (sizeof(T) == 4) ? 23 : 10; + uint32_t x; + if (sizeof(T) == 4) { + x = reinterpret_cast(_x); + } else { + x = reinterpret_cast(_x); + } + + uint32_t head, mantissa; + int exponent, bias; + uint32_t sign; + + if (sizeof(T) == 4) { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + } else { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + } + + uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm); + + // Deal with inf and NaNs + if (negative_zero_nan) { + if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) { + return 0x80; + } + } else { + // if(__hisinf(x) || __hisnan(x)) + if ((x & 0x7C00) == 0x7C00) { + return 0x80; + } + } + } else { + if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) { + return signed_inf + (mantissa != 0 ? 
1 : 0); + } + } else { + if ((x & 0x7C00) == 0x7C00) { + return signed_inf + (mantissa != 0 ? 1 : 0); + } + } + } + if (x == 0) { + return 0; + } + + // First need to check if it is normal or denorm as there is a difference of + // implict 1 Then need to adjust the exponent to align with the F8 exponent, + // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng + // to mantissa and truncate. And for RNE, no need to add rng. Then probably + // need to check whether there is carry and adjust exponent and mantissa again + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent + // bits + const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0); + const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal + // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + // f8_exponent is the converted f8 exponent with bias encoding + // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + // the difference needs to be adjusted and mantissa shifted + int act_exponent, f8_exponent, exponent_diff; + + if (exponent == 0) { // fp32/fp16 is in denormal. + /* fp32 denormal is below 2^-127 so it is usually not a concern here, we +mostly concern fp16 here. In this case, f8 is usually in denormal. But there +could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has +exponent bias 16. It means that there are some numbers in fp16 denormal but they +are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers +where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 +(NANOO) normal. 
In this case, the fp16 mantissa should be shift left by 1 */ + act_exponent = exponent - bias + 1; + exponent_diff = f8_denormal_act_exponent - act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } else { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if (act_exponent <= f8_denormal_act_exponent) { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal + range. For example fp8 nanoo mode, denormal exponent is -7, but if the + fp32/fp16 actual exponent is -7, it is actually larger due to the implict 1, + Therefore it needs to be adjust to -6 and mantissa shift right by 1. + So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } else { // both fp32/fp16 and f8 are in normal range + exponent_diff = 0; // exponent_diff=0 does not mean there is no difference + // for this case, + // act_exponent could be larger. Just that it does not need shift mantissa + } + mantissa += (1 << mfmt); // Add the implicit 1 into mantissa + } + + bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) == + static_cast(1 << (mfmt - wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be + done before we shift right as shift right could rip off some residual part + and make something not midpoint look like midpoint. For example, the fp16 + number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after + shift right by 4 bits, it would look like midpoint. +*/ + + if (exponent_diff > 0) { + mantissa >>= exponent_diff; + } else if (exponent_diff == -1) { + mantissa <<= -exponent_diff; + } + bool implicit_one = mantissa & (1 << mfmt); + // if there is no implict 1, it means the f8 is denormal and need to adjust + // to denorm exponent + f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 
0 : 1); + + // Now we have the exponent and mantissa adjusted + uint32_t drop_mask = (1 << (mfmt - wm)) - 1; + bool odd = mantissa & (1 << (mfmt - wm)); // if the least significant bit that + // is not truncated is 1 + mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask; + + // Now we deal with overflow + if (f8_exponent == 0) { + if ((1 << mfmt) & mantissa) { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } else { + if ((1 << (mfmt + 1)) & mantissa) { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); + if (f8_exponent > max_exp) { + if (clip) { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } else { + return signed_inf; + } + } + + if (f8_exponent == 0 && mantissa == 0) { + return negative_zero_nan ? 0 : (sign << 7); + } + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +template +inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x) +{ +#ifdef __HIPCC__ + constexpr bool is_half = std::is_same::value; +#else + constexpr bool is_half = false; +#endif + constexpr bool is_float = std::is_same::value; + static_assert(is_half || is_float, "only half and float are supported"); + + constexpr int weo = is_half ? 5 : 8; + constexpr int wmo = is_half ? 10 : (is_float ? 
23 : 7); + + T fInf, fNegInf, fNaN, fNeg0; + +#ifdef __HIPCC__ + if (is_half) { + const uint16_t ihInf = 0x7C00; + const uint16_t ihNegInf = 0xFC00; + const uint16_t ihNaN = 0x7C01; + const uint16_t ihNeg0 = 0x8000; + fInf = reinterpret_cast(ihInf); + fNegInf = reinterpret_cast(ihNegInf); + fNaN = reinterpret_cast(ihNaN); + fNeg0 = reinterpret_cast(ihNeg0); + } else +#endif + if (is_float) { + const uint32_t ifInf = 0x7F800000; + const uint32_t ifNegInf = 0xFF800000; + const uint32_t ifNaN = 0x7F800001; + const uint32_t ifNeg0 = 0x80000000; + fInf = reinterpret_cast(ifInf); + fNegInf = reinterpret_cast(ifNegInf); + fNaN = reinterpret_cast(ifNaN); + fNeg0 = reinterpret_cast(ifNeg0); + } + + if (x == 0) { + return 0; + } + + uint32_t sign = x >> 7; + uint32_t mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if (negative_zero_nan) { + if (x == 0x80) { + return fNaN; + } + } else { + if (x == 0x80) { + return fNeg0; + } + if (exponent == ((1 << we) - 1)) { + return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + } + typename std::conditional::type retval; + if (we == 5 && is_half && !negative_zero_nan) { + retval = x << 8; + return reinterpret_cast(retval); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0); + + // subnormal input + if (exponent == 0) { + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + clz(mantissa) - (32 - wm); + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1 << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if (exponent <= 0) { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if (sizeof(T) == 2) { + retval = (sign << 15) | (exponent << 10) | mantissa; + } else { + retval = (sign << 31) | (exponent << 23) | mantissa; + } + return reinterpret_cast(retval); +} + +} // namespace hip_fp8_impl diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh new file mode 100644 index 0000000000000..afd37cc9da0d3 --- /dev/null +++ b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -0,0 +1,293 @@ +#pragma once +#include "hip_float8.h" + +#include +#include +#include + +#include "../../../attention/dtype_float32.cuh" +#include "../../../attention/dtype_bfloat16.cuh" + +namespace vllm +{ + +template +__inline__ __device__ Tout vec_conversion(const Tin& x) +{ + return x; +} + +// fp8 -> half +template <> +__inline__ __device__ uint16_t vec_conversion(const uint8_t& a) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + __half_raw res; + res.data = static_cast(f8); + return res.x; +} + +// fp8x2 -> half2 +template <> +__inline__ __device__ uint32_t vec_conversion(const uint16_t& a) +{ +#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + union { + __half2_raw h2r; + uint32_t ui32; + } tmp; + tmp.h2r.x.data = f2[0]; + tmp.h2r.y.data = f2[1]; + return tmp.ui32; +#else + union { + uint16_t u16[2]; + uint32_t u32; + } tmp; + + tmp.u16[0] = vec_conversion(static_cast(a)); + tmp.u16[1] = vec_conversion(static_cast(a >> 8U)); + return tmp.u32; +#endif +} + +// 
fp8x4 -> half2x2 +template <> +__inline__ __device__ uint2 vec_conversion(const uint32_t& a) +{ + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = vec_conversion((uint16_t)a); + tmp.u32[1] = vec_conversion((uint16_t)(a >> 16U)); + return tmp.u32x2; +} + +// fp8x8 -> half2x4 +template <> +__inline__ __device__ uint4 vec_conversion(const uint2& a) +{ + union { + uint4 u64x2; + uint2 u64[2]; + } tmp; + tmp.u64[0] = vec_conversion(a.x); + tmp.u64[1] = vec_conversion(a.y); + return tmp.u64x2; +} + +using __nv_bfloat16 = __hip_bfloat16; + +// fp8 -> __nv_bfloat16 +template <> +__inline__ __device__ __nv_bfloat16 vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + float f{f8}; + return __float2bfloat16(f); +} + +using __nv_bfloat162 = __hip_bfloat162; + +// fp8x2 -> __nv_bfloat162 +template <> +__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a) +{ + __nv_bfloat162 res; + res.x = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a); + res.y = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U)); + return res; +} + +// fp8x4 -> bf16_4_t +template <> +__inline__ __device__ bf16_4_t vec_conversion(const uint32_t& a) +{ + bf16_4_t res; + res.x = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a); + res.y = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U)); + return res; +} + +// fp8x8 -> bf16_8_t +template <> +__inline__ __device__ bf16_8_t vec_conversion(const uint2& a) +{ + bf16_4_t tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// fp8 -> float +template <> +__inline__ __device__ float vec_conversion(const uint8_t& a) +{ + hip_fp8 fp8{a, hip_fp8::from_bits()}; + return static_cast(fp8); +} + +// fp8x2 -> float2 +template <> +__inline__ __device__ float2 vec_conversion(const uint16_t& a) +{ +#if 
defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + float2 res; + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + //res.x = vec_conversion(static_cast(a)); + //res.y = vec_conversion(static_cast(a >> 8U)); + res.x = f2[0]; + res.y = f2[1]; + return res; +#else + float2 res; + res.x = vec_conversion(static_cast(a)); + res.y = vec_conversion(static_cast(a >> 8U)); + return res; +#endif +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ Float4_ vec_conversion(const uint32_t& a) +{ + Float4_ res; + res.x = vec_conversion((uint16_t)a); + res.y = vec_conversion((uint16_t)(a >> 16U)); + return res; +} + +// fp8x8 -> float8 +template <> +__inline__ __device__ Float8_ vec_conversion(const uint2& a) +{ + Float4_ tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// half -> fp8 +template <> +__inline__ __device__ uint8_t vec_conversion(const uint16_t& a) +{ + __half_raw tmp; + tmp.x = a; + + hip_fp8 f8{static_cast(tmp.data)}; + return f8.data; +} + +// bf16 -> fp8 +template <> +__inline__ __device__ uint8_t vec_conversion(const __nv_bfloat16& a) +{ + hip_fp8 res{__bfloat162float(a)}; + return res.data; +} + +// float -> fp8 +template <> +__inline__ __device__ uint8_t vec_conversion(const float& a) +{ + hip_fp8 f8(a); + return f8.data; +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ float4 vec_conversion(const uint32_t& a) +{ + Float4_ tmp = vec_conversion(a); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; +} + +template <> +__inline__ __device__ uint32_t vec_conversion(const float2& a) +{ + union { + half2 float16; + uint32_t uint32; + }; + + float16 = __float22half2_rn(a); + return uint32; +} + +template <> +__inline__ __device__ uint2 vec_conversion(const Float4_& a) +{ + uint2 b; + float2 val; + val.x = a.x.x; + val.y = a.x.y; + b.x = vec_conversion(val); + + 
val.x = a.y.x; + val.y = a.y.y; + b.y = vec_conversion(val); + return b; +} + +template <> +__inline__ __device__ float4 vec_conversion(const Float4_& a) +{ + float4 b; + b.x = a.x.x; + b.y = a.x.y; + b.z = a.y.x; + b.w = a.y.y; + return b; +} + +template <> +__inline__ __device__ uint4 vec_conversion(const Float8_& a) +{ + uint4 b; + b.x = vec_conversion(a.x); + b.y = vec_conversion(a.y); + b.z = vec_conversion(a.z); + b.w = vec_conversion(a.w); + return b; +} + +template <> +__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a) +{ + __nv_bfloat162 b = __float22bfloat162_rn(a); + return b; +} + +template <> +__inline__ __device__ bf16_4_t vec_conversion(const Float4_& a) +{ + bf16_4_t b; + b.x = __float22bfloat162_rn(a.x); + b.y = __float22bfloat162_rn(a.y); + return b; +} + +template <> +__inline__ __device__ bf16_8_t vec_conversion(const Float8_& a) +{ + bf16_8_t b; + b.x = __float22bfloat162_rn(a.x); + b.y = __float22bfloat162_rn(a.y); + b.z = __float22bfloat162_rn(a.z); + b.w = __float22bfloat162_rn(a.w); + return b; +} +} // namespace vllm From 9b1577a4cdf0f979173a0ccf09a308a7284cf74d Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Mon, 5 Feb 2024 17:03:24 -0800 Subject: [PATCH 004/159] Add 3rdparty quantizer utility and usage to quantize models (HF default) --- 3rdparty/README.md | 8 ++ 3rdparty/quantizer/hf_ptq.py | 211 +++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 3rdparty/README.md create mode 100644 3rdparty/quantizer/hf_ptq.py diff --git a/3rdparty/README.md b/3rdparty/README.md new file mode 100644 index 0000000000000..afa2a9d4c7657 --- /dev/null +++ b/3rdparty/README.md @@ -0,0 +1,8 @@ +### quantizer utilities +`quantizer/hf_ptq.py`: Quantization utilities from AMMO and/or TensorRT-LLM, usage embedded at top + +### AMMO (AlgorithMic Model Optimization) Installation +`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` + +### AMMO Download 
+`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` diff --git a/3rdparty/quantizer/hf_ptq.py b/3rdparty/quantizer/hf_ptq.py new file mode 100644 index 0000000000000..5597ba9991e49 --- /dev/null +++ b/3rdparty/quantizer/hf_ptq.py @@ -0,0 +1,211 @@ +# with AMMO installed, do below: +# python hf_ptq.py --pyt_ckpt_path="./ll2-7b" --export_path=ll2_7b_ptq_fp8 --qformat=fp8 --calib_size=128 --inference_gpus=1 +# python hf_ptq.py --pyt_ckpt_path= \ +# --export_path=llama_ptq \ +# --qformat=fp8 \ +# --calib_size=128 \ +# --inference_gpus=1 +# +# with TensorRT-LLM is installed, similarly do below: +# /dockerx/TensorRT-LLM/examples/quantization# python quantize.py --model_dir /dockerx/ll2-7b --dtype float16 --qformat fp8 --export_path /dockerx/ll2_7b_quantized_fp8 --calib_size 256 + +import argparse +import copy +import random +import time + +import numpy as np +import torch +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +import ammo.torch.quantization as atq +from ammo.torch.export import export_model_config + +RAND_SEED = 1234 +MAX_SEQ_LEN = 2048 + +QUANT_CFG_CHOICES = { + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, +} + +def get_calib_dataloader(data="cnn_dailymail", tokenizer=None, batch_size=1, calib_size=512, block_size=512, device=None): + print("Loading calibration dataset") + if data == "pileval": + dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") + dataset = dataset["text"][:calib_size] + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + dataset = dataset["article"][:calib_size] + else: + raise NotImplementedError + batch_encoded = tokenizer.batch_encode_plus( + dataset, return_tensors="pt", padding=True, truncation=True, max_length=block_size + ) + if device: + 
batch_encoded = batch_encoded.to(device) + batch_encoded = batch_encoded["input_ids"] + calib_dataloader = DataLoader(batch_encoded, batch_size=batch_size, shuffle=False) + return calib_dataloader + + +def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN): + print(f"Initializing tokenizer from {ckpt_path}") + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, + model_max_length=max_seq_len, + padding_side="left", + trust_remote_code=True, + ) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def get_model(ckpt_path, dtype="fp16", device="cuda"): + print(f"Initializing model from {ckpt_path}") + if dtype == "bf16": + dtype = torch.bfloat16 + elif dtype == "fp16": + dtype = torch.float16 + elif dtype == "fp32": + dtype = torch.float32 + else: + raise NotImplementedError(f"Unknown dtype {dtype}") + model_kwargs = {"torch_dtype": dtype} + model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=True) + model.eval() + return model + + +def quantize_model(model, quant_cfg, calib_dataloader=None): + def calibrate_loop(): + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + print(f"Calibrating batch {idx}") + model(data) + + print("Starting quantization...") + start_time = time.time() + atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + end_time = time.time() + print(f"Quantization done. Total time used: {end_time - start_time}s") + return model + + +def _register_falcon_linears(model): + """Register Falcon linear modules as Quantiation. + + As falcon models could use remote code, which will be loaded dynamically, to build their model. + Therefore, we need to register the linear on the fly before quantization. 
+ + """ + if type(model).__name__ in ["RWForCausalLM", "FalconForCausalLM"]: + from ammo.torch.quantization import tensor_quant + from ammo.torch.quantization.nn.modules.quant_module import QuantLinearConvBase + + linear_type = type(model.transformer.h[0].self_attention.dense) + + class QuantFalconLinearRW1B(linear_type, QuantLinearConvBase): # type: ignore + default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW + + atq.module_mapping.QUANT_MODULE_MAPPING[linear_type] = QuantFalconLinearRW1B.convert + + + +def main(args): + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for inference.") + + random.seed(RAND_SEED) + np.random.seed(RAND_SEED) + + tokenizer = get_tokenizer(args.pyt_ckpt_path) + model = get_model(args.pyt_ckpt_path, args.dtype, args.device) + + _register_falcon_linears(model) + if args.qformat in ["fp8", "int8_sq", "int4_awq"]: + if args.qformat == "int4_awq": + if args.calib_size > 32: + calib_size = 32 + print( + f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using" + f" calib_size={calib_size} instead" + ) + print( + "\nAWQ calibration could take longer than other calibration methods. Please" + " increase the batch size to speed up the calibration process. 
Batch size can be" + " set by adding the argument --batch_size to the command line.\n" + ) + else: + calib_size = args.calib_size + + calib_dataloader = get_calib_dataloader( + tokenizer=tokenizer, + batch_size=args.batch_size, + calib_size=calib_size, + device=args.device, + ) + if args.qformat in QUANT_CFG_CHOICES: + quant_cfg = QUANT_CFG_CHOICES[args.qformat] + else: + raise ValueError(f"Unsupported quantization format: {args.qformat}") + + if args.qformat == "int4_awq": + quant_cfg = copy.deepcopy(atq.INT4_AWQ_CFG) + quant_cfg["quant_cfg"]["*weight_quantizer"]["block_sizes"][-1] = args.awq_block_size # type: ignore + + model = quantize_model(model, quant_cfg, calib_dataloader) + else: + print(f"No quantization applied, export {args.dtype} model") + + + with torch.inference_mode(): + if any([k in type(model).__name__ for k in ["Llama", "Mistral"]]): + model_type = "llama" + elif "GPTJ" in type(model).__name__: + model_type = "gptj" + elif type(model).__name__ in ["FalconForCausalLM", "RWForCausalLM"]: + model_type = "falcon" + elif "baichuan" in type(model).__name__.lower(): + model_type = "baichuan" + elif "MPT" in type(model).__name__: + model_type = "mpt" + else: + print(f"Unknown model type {type(model).__name__}. Continue exporting...") + model_type = f"unknown:{type(model).__name__}" + + export_path = args.export_path + start_time = time.time() + export_model_config( + model, + model_type, + torch.float16, + export_dir=export_path, + inference_tensor_parallel=int(args.inference_gpus), + ) + end_time = time.time() + print( + f"Quantized model exported to :{export_path}. 
Total time used {end_time - start_time}s" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--pyt_ckpt_path", help="Specify where the PyTorch checkpoint path is", required=True) + parser.add_argument("--device", default="cuda") + parser.add_argument("--dtype", help="Model data type.", default="fp16") + parser.add_argument("--qformat", help="Quantization format.", default="fp8") + parser.add_argument("--batch_size", help="Batch size for calibration.", type=int, default=1) + parser.add_argument("--calib_size", help="Number of samples for calibration.", type=int, default=512) + parser.add_argument("--export_path", default="exported_model") + parser.add_argument("--inference_gpus", default=1) + parser.add_argument("--awq_block_size", default=128) + + args = parser.parse_args() + + main(args) + + From 644b16504bf5f64c2d4eb32db39b5a70ee4d2fd3 Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Tue, 6 Feb 2024 17:18:09 -0800 Subject: [PATCH 005/159] Update 3rdparty quantizer utility and usage with ammo updates --- 3rdparty/README.md | 32 ++- 3rdparty/quantizer/hf_ptq.py | 211 ------------------ 3rdparty/quantizer/quantize.py | 380 +++++++++++++++++++++++++++++++++ 3 files changed, 408 insertions(+), 215 deletions(-) delete mode 100644 3rdparty/quantizer/hf_ptq.py create mode 100644 3rdparty/quantizer/quantize.py diff --git a/3rdparty/README.md b/3rdparty/README.md index afa2a9d4c7657..1ad63fabf5bed 100644 --- a/3rdparty/README.md +++ b/3rdparty/README.md @@ -1,8 +1,32 @@ -### quantizer utilities -`quantizer/hf_ptq.py`: Quantization utilities from AMMO and/or TensorRT-LLM, usage embedded at top +### Quantizer Utilities +`quantizer/quantize.py`: nVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM: +`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py` -### AMMO (AlgorithMic Model Optimization) Installation +### Prerequisite + +#### AMMO (AlgorithMic Model Optimization) 
Installation: nvidia-ammo 0.7.1 or later `pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` -### AMMO Download +#### AMMO Download (code and docs) `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` +`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` + +### Usage + +#### Run on H100 system for speed if FP8; number of GPUs depends on the model size + +#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: +`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` + +Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) +``` +# ll ./ll2_7b_fp8/ +total 19998244 +drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ +drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ +-rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json +-rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz +-rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors +# +``` + diff --git a/3rdparty/quantizer/hf_ptq.py b/3rdparty/quantizer/hf_ptq.py deleted file mode 100644 index 5597ba9991e49..0000000000000 --- a/3rdparty/quantizer/hf_ptq.py +++ /dev/null @@ -1,211 +0,0 @@ -# with AMMO installed, do below: -# python hf_ptq.py --pyt_ckpt_path="./ll2-7b" --export_path=ll2_7b_ptq_fp8 --qformat=fp8 --calib_size=128 --inference_gpus=1 -# python hf_ptq.py --pyt_ckpt_path= \ -# --export_path=llama_ptq \ -# --qformat=fp8 \ -# --calib_size=128 \ -# --inference_gpus=1 -# -# with TensorRT-LLM is installed, similarly do below: -# /dockerx/TensorRT-LLM/examples/quantization# python quantize.py --model_dir /dockerx/ll2-7b --dtype float16 --qformat fp8 --export_path /dockerx/ll2_7b_quantized_fp8 --calib_size 256 - -import argparse -import copy -import random -import time - -import numpy as np 
-import torch -from datasets import load_dataset -from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoTokenizer - -import ammo.torch.quantization as atq -from ammo.torch.export import export_model_config - -RAND_SEED = 1234 -MAX_SEQ_LEN = 2048 - -QUANT_CFG_CHOICES = { - "int8_sq": atq.INT8_SMOOTHQUANT_CFG, - "fp8": atq.FP8_DEFAULT_CFG, - "int4_awq": atq.INT4_AWQ_CFG, -} - -def get_calib_dataloader(data="cnn_dailymail", tokenizer=None, batch_size=1, calib_size=512, block_size=512, device=None): - print("Loading calibration dataset") - if data == "pileval": - dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") - dataset = dataset["text"][:calib_size] - elif data == "cnn_dailymail": - dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") - dataset = dataset["article"][:calib_size] - else: - raise NotImplementedError - batch_encoded = tokenizer.batch_encode_plus( - dataset, return_tensors="pt", padding=True, truncation=True, max_length=block_size - ) - if device: - batch_encoded = batch_encoded.to(device) - batch_encoded = batch_encoded["input_ids"] - calib_dataloader = DataLoader(batch_encoded, batch_size=batch_size, shuffle=False) - return calib_dataloader - - -def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN): - print(f"Initializing tokenizer from {ckpt_path}") - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, - model_max_length=max_seq_len, - padding_side="left", - trust_remote_code=True, - ) - tokenizer.pad_token = tokenizer.eos_token - return tokenizer - - -def get_model(ckpt_path, dtype="fp16", device="cuda"): - print(f"Initializing model from {ckpt_path}") - if dtype == "bf16": - dtype = torch.bfloat16 - elif dtype == "fp16": - dtype = torch.float16 - elif dtype == "fp32": - dtype = torch.float32 - else: - raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} - model = 
AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=True) - model.eval() - return model - - -def quantize_model(model, quant_cfg, calib_dataloader=None): - def calibrate_loop(): - """Adjusts weights and scaling factors based on selected algorithms.""" - for idx, data in enumerate(calib_dataloader): - print(f"Calibrating batch {idx}") - model(data) - - print("Starting quantization...") - start_time = time.time() - atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - end_time = time.time() - print(f"Quantization done. Total time used: {end_time - start_time}s") - return model - - -def _register_falcon_linears(model): - """Register Falcon linear modules as Quantiation. - - As falcon models could use remote code, which will be loaded dynamically, to build their model. - Therefore, we need to register the linear on the fly before quantization. - - """ - if type(model).__name__ in ["RWForCausalLM", "FalconForCausalLM"]: - from ammo.torch.quantization import tensor_quant - from ammo.torch.quantization.nn.modules.quant_module import QuantLinearConvBase - - linear_type = type(model.transformer.h[0].self_attention.dense) - - class QuantFalconLinearRW1B(linear_type, QuantLinearConvBase): # type: ignore - default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW - - atq.module_mapping.QUANT_MODULE_MAPPING[linear_type] = QuantFalconLinearRW1B.convert - - - -def main(args): - if not torch.cuda.is_available(): - raise EnvironmentError("GPU is required for inference.") - - random.seed(RAND_SEED) - np.random.seed(RAND_SEED) - - tokenizer = get_tokenizer(args.pyt_ckpt_path) - model = get_model(args.pyt_ckpt_path, args.dtype, args.device) - - _register_falcon_linears(model) - if args.qformat in ["fp8", "int8_sq", "int4_awq"]: - if args.qformat == "int4_awq": - if args.calib_size > 32: - calib_size = 32 - print( - f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using" - f" 
calib_size={calib_size} instead" - ) - print( - "\nAWQ calibration could take longer than other calibration methods. Please" - " increase the batch size to speed up the calibration process. Batch size can be" - " set by adding the argument --batch_size to the command line.\n" - ) - else: - calib_size = args.calib_size - - calib_dataloader = get_calib_dataloader( - tokenizer=tokenizer, - batch_size=args.batch_size, - calib_size=calib_size, - device=args.device, - ) - if args.qformat in QUANT_CFG_CHOICES: - quant_cfg = QUANT_CFG_CHOICES[args.qformat] - else: - raise ValueError(f"Unsupported quantization format: {args.qformat}") - - if args.qformat == "int4_awq": - quant_cfg = copy.deepcopy(atq.INT4_AWQ_CFG) - quant_cfg["quant_cfg"]["*weight_quantizer"]["block_sizes"][-1] = args.awq_block_size # type: ignore - - model = quantize_model(model, quant_cfg, calib_dataloader) - else: - print(f"No quantization applied, export {args.dtype} model") - - - with torch.inference_mode(): - if any([k in type(model).__name__ for k in ["Llama", "Mistral"]]): - model_type = "llama" - elif "GPTJ" in type(model).__name__: - model_type = "gptj" - elif type(model).__name__ in ["FalconForCausalLM", "RWForCausalLM"]: - model_type = "falcon" - elif "baichuan" in type(model).__name__.lower(): - model_type = "baichuan" - elif "MPT" in type(model).__name__: - model_type = "mpt" - else: - print(f"Unknown model type {type(model).__name__}. Continue exporting...") - model_type = f"unknown:{type(model).__name__}" - - export_path = args.export_path - start_time = time.time() - export_model_config( - model, - model_type, - torch.float16, - export_dir=export_path, - inference_tensor_parallel=int(args.inference_gpus), - ) - end_time = time.time() - print( - f"Quantized model exported to :{export_path}. 
Total time used {end_time - start_time}s" - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--pyt_ckpt_path", help="Specify where the PyTorch checkpoint path is", required=True) - parser.add_argument("--device", default="cuda") - parser.add_argument("--dtype", help="Model data type.", default="fp16") - parser.add_argument("--qformat", help="Quantization format.", default="fp8") - parser.add_argument("--batch_size", help="Batch size for calibration.", type=int, default=1) - parser.add_argument("--calib_size", help="Number of samples for calibration.", type=int, default=512) - parser.add_argument("--export_path", default="exported_model") - parser.add_argument("--inference_gpus", default=1) - parser.add_argument("--awq_block_size", default=128) - - args = parser.parse_args() - - main(args) - - diff --git a/3rdparty/quantizer/quantize.py b/3rdparty/quantizer/quantize.py new file mode 100644 index 0000000000000..a68f21a89c65d --- /dev/null +++ b/3rdparty/quantizer/quantize.py @@ -0,0 +1,380 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Adapted from examples/quantization/hf_ptq.py +""" + +import argparse +import copy +import json +import random +import time + +import ammo.torch.quantization as atq +import numpy as np +import torch +from ammo.torch.export import export_model_config +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +RAND_SEED = 1234 +MAX_SEQ_LEN = 2048 + +EMPTY_CFG = { + "quant_cfg": { + "*weight_quantizer": { + "enable": False, + }, + "*input_quantizer": { + "enable": False + }, + "*lm_head*": { + "enable": False + }, + "*output_layer*": { + "enable": False + }, + "default": { + "enable": False + }, + }, + "algorithm": "max", +} + +KV_CACHE_CFG = { + "*.query_key_value.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.Wqkv.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.W_pack.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.c_attn.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.k_proj.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.v_proj.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, +} + +QUANT_CFG_CHOICES = { + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, + "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + "int8_wo": EMPTY_CFG, + "int4_wo": EMPTY_CFG, + "full_prec": EMPTY_CFG, +} + +MODEL_NAME_PATTERN_MAP = { + "GPT2": "gpt2", + "Xverse": "llama", + "Llama": "llama", + "Mistral": "llama", + "GPTJ": "gptj", + "FalconForCausalLM": "falcon", + "RWForCausalLM": "falcon", + "baichuan": "baichuan", + "MPT": "mpt", + "Bloom": "bloom", + "ChatGLM": "chatglm", + "QWen": "qwen", +} + + +def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None): + print(f"Initializing tokenizer from {ckpt_path}") + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, 
+ model_max_length=max_seq_len, + padding_side="left", + trust_remote_code=True, + ) + if model_type and model_type == "qwen": + # qwen use token id 151643 as pad and eos tokens + tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643) + tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643) + + # can't set attribute 'pad_token' for "" + if tokenizer.pad_token != "": + tokenizer.pad_token = tokenizer.eos_token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + assert tokenizer.pad_token is not None, f"Pad token for {model_type} cannot be set!" + + return tokenizer + + +def get_model(ckpt_path, dtype="fp16", device="cuda"): + print(f"Initializing model from {ckpt_path}") + if dtype == "bf16" or dtype == "bfloat16": + dtype = torch.bfloat16 + elif dtype == "fp16" or dtype == "float16": + dtype = torch.float16 + elif dtype == "fp32" or dtype == "float32": + dtype = torch.float32 + else: + raise NotImplementedError(f"Unknown dtype {dtype}") + + # model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"torch_dtype": "auto"} + + model = AutoModelForCausalLM.from_pretrained(ckpt_path, + device_map="auto", + **model_kwargs, + trust_remote_code=True) + model.eval() + + model_dtype = next(model.parameters()).dtype + if dtype != model_dtype: + print( + f"[TensorRT-LLM][WARNING] The manually set model data type is {dtype}, " + f"but the data type of the HuggingFace model is {model_dtype}.") + + return model + + +def get_model_type(model): + for k, v in MODEL_NAME_PATTERN_MAP.items(): + if k.lower() in type(model).__name__.lower(): + return v + return None + + +def get_calib_dataloader(data="cnn_dailymail", + tokenizer=None, + batch_size=1, + calib_size=512, + block_size=512, + device=None): + print("Loading calibration dataset") + if data == "pileval": + dataset = load_dataset( + "json", + data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", + split="train") + dataset = dataset["text"][:calib_size] + elif data == 
"cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + dataset = dataset["article"][:calib_size] + else: + raise NotImplementedError + + batch_encoded = tokenizer.batch_encode_plus(dataset, + return_tensors="pt", + padding=True, + truncation=True, + max_length=block_size) + if device: + batch_encoded = batch_encoded.to(device) + batch_encoded = batch_encoded["input_ids"] + + calib_dataloader = DataLoader(batch_encoded, + batch_size=batch_size, + shuffle=False) + + return calib_dataloader + + +def quantize_model(model, quant_cfg, calib_dataloader=None): + + def calibrate_loop(): + if calib_dataloader is None: + return + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + print(f"Calibrating batch {idx}") + model(data) + + print("Starting quantization...") + start_time = time.time() + atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + end_time = time.time() + print("Quantization done. Total time used: {:.2f} s.".format(end_time - + start_time)) + + return model + + +def main(args): + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for inference.") + + random.seed(RAND_SEED) + np.random.seed(RAND_SEED) + + model = get_model(args.model_dir, args.dtype, args.device) + model_type = get_model_type(model) + tokenizer = get_tokenizer(args.model_dir, model_type=model_type) + + if args.qformat in ["full_prec", "int8_wo", "int4_wo" + ] and args.kv_cache_dtype is None: + print(f"No quantization applied, export {args.dtype} model") + else: + if "awq" in args.qformat: + if args.calib_size > 32: + print( + f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using" + " calib_size=32 instead") + args.calib_size = 32 + print( + "\nAWQ calibration could take longer than other calibration methods. Please" + " increase the batch size to speed up the calibration process. 
Batch size can be" + " set by adding the argument --batch_size to the command line.\n" + ) + + calib_dataloader = get_calib_dataloader( + tokenizer=tokenizer, + batch_size=args.batch_size, + calib_size=args.calib_size, + device=args.device, + ) + + if args.qformat in QUANT_CFG_CHOICES: + quant_cfg = QUANT_CFG_CHOICES[args.qformat] + else: + raise ValueError(f"Unsupported quantization format: {args.qformat}") + + if "awq" in args.qformat: + quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat]) + weight_quantizer = quant_cfg["quant_cfg"][ + "*weight_quantizer"] # type: ignore + if isinstance(weight_quantizer, list): + weight_quantizer = weight_quantizer[0] + weight_quantizer["block_sizes"][-1] = args.awq_block_size + + if args.kv_cache_dtype is not None: + if args.kv_cache_dtype == "fp8": + for value in KV_CACHE_CFG.values(): + value.update({"num_bits": (4, 3)}) # type: ignore + quant_cfg["quant_cfg"].update(KV_CACHE_CFG) # type: ignore + + print(quant_cfg) + + model = quantize_model(model, quant_cfg, calib_dataloader) + + with torch.inference_mode(): + if model_type is None: + print( + f"Unknown model type {type(model).__name__}. Continue exporting..." 
+ ) + model_type = f"unknown:{type(model).__name__}" + + export_path = args.output_dir + start_time = time.time() + + if args.qformat == "int4_awq" and model_type == "qwen": + torch.save(model.state_dict(), export_path) + else: + export_npz = (model_type not in [ + 'gptj', 'falcon', 'chatglm', 'mpt', 'llama', 'baichuan' + ]) + + # export safetensors + export_model_config(model, + model_type, + getattr(torch, args.dtype), + export_dir=export_path, + inference_tensor_parallel=args.tp_size, + inference_pipeline_parallel=args.pp_size, + # export_tensorrt_llm_config=(not export_npz), + export_tensorrt_llm_config=False, + export_npz=export_npz) + + # export npz (reference) + export_model_config(model, + model_type, + getattr(torch, args.dtype), + export_dir=export_path, + inference_tensor_parallel=args.tp_size, + inference_pipeline_parallel=args.pp_size, + # export_tensorrt_llm_config=(not export_npz), + export_tensorrt_llm_config=False, + # export_npz=export_npz, + export_npz=True) + + # Workaround for wo quantization + if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: + with open(f"{export_path}/config.json", 'r') as f: + tensorrt_llm_config = json.load(f) + if args.qformat == "int8_wo": + tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' + elif args.qformat == "int4_wo": + tensorrt_llm_config["quantization"]["quant_algo"] = 'W4A16' + else: + tensorrt_llm_config["quantization"]["quant_algo"] = None + with open(f"{export_path}/config.json", "w") as f: + json.dump(tensorrt_llm_config, f, indent=4) + + end_time = time.time() + print( + "Quantized model exported to {} \nTotal time used {:.2f} s.".format( + export_path, end_time - start_time)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--model_dir", + help="Specify where the HuggingFace model is", + required=True) + parser.add_argument("--device", default="cuda") + parser.add_argument("--dtype", help="Model data type.", default="float16") + 
parser.add_argument( + "--qformat", + help="Quantization format.", + default="full_prec", + choices=[ + "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", + "full_prec" + ], + ) + parser.add_argument("--batch_size", + help="Batch size for calibration.", + type=int, + default=1) + parser.add_argument("--calib_size", + help="Number of samples for calibration.", + type=int, + default=512) + parser.add_argument("--output_dir", default="exported_model") + parser.add_argument("--tp_size", type=int, default=1) + parser.add_argument("--pp_size", type=int, default=1) + parser.add_argument("--awq_block_size", type=int, default=128) + parser.add_argument("--kv_cache_dtype", + help="KV Cache dtype.", + default=None, + choices=["int8", "fp8", None]) + args = parser.parse_args() + + main(args) From 0ed1d98225f4554e3de6aa5cc8a47bce139d5fc1 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 7 Feb 2024 13:51:56 -0800 Subject: [PATCH 006/159] Use e4m3 and e5m2 interchangeably --- csrc/attention/dtype_fp8_e5m2.cuh | 4 ++-- csrc/cache_kernels.cu | 19 +++++++++++++++---- .../fp8/amd_detail/quant_utils.cuh | 3 ++- tests/kernels/test_cache.py | 8 +++++--- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/csrc/attention/dtype_fp8_e5m2.cuh b/csrc/attention/dtype_fp8_e5m2.cuh index 0580fbb8e863f..7d9e15e24237d 100644 --- a/csrc/attention/dtype_fp8_e5m2.cuh +++ b/csrc/attention/dtype_fp8_e5m2.cuh @@ -8,7 +8,7 @@ #endif namespace vllm { -#ifdef ENABLE_FP8_E5M2 +//#ifdef ENABLE_FP8_E5M2 // fp8 vector types for quantization of kv cache template<> @@ -30,6 +30,6 @@ template<> struct Vec { using Type = uint2; }; -#endif // ENABLE_FP8_E5M2 +//#endif // ENABLE_FP8_E5M2 } // namespace vllm diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index ceb7347d94670..f8fbe345d487d 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -6,6 +6,8 @@ #include "dispatch_utils.h" #ifdef ENABLE_FP8_E5M2 #include 
"quantization/fp8_e5m2_kvcache/quant_utils.cuh" +#else +#include "quantization/fp8/amd_detail/quant_utils.cuh" #endif #include @@ -200,7 +202,8 @@ __global__ void reshape_and_cache_kernel( key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); #else - assert(false); + key_cache[tgt_key_idx] = fp8_e4m3::vec_conversion(tgt_key); + value_cache[tgt_value_idx] = fp8_e4m3::vec_conversion(tgt_value); #endif } else { key_cache[tgt_key_idx] = tgt_key; @@ -438,10 +441,10 @@ __global__ void convert_fp8_e5m2_kernel( const int64_t block_idx = blockIdx.x; for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { int64_t idx = block_idx * block_stride + i; -#ifdef ENABLE_FP8_E5M2 + #ifdef ENABLE_FP8_E5M2 dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion(src_cache[idx]); #else - assert(false); + dst_cache[idx] = fp8_e4m3::vec_conversion(src_cache[idx]); #endif } } @@ -458,13 +461,21 @@ void convert_fp8_e5m2( torch::Tensor& src_cache, torch::Tensor& dst_cache) { + torch::Device src_device = src_cache.device(); + torch::Device dst_device = dst_cache.device(); + if (src_device.is_cuda() && dst_device.is_cuda()) { + TORCH_CHECK( + src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); + } + const at::cuda::OptionalCUDAGuard device_guard(device_of(src_cache)); int64_t num_blocks = src_cache.size(0); int64_t block_stride = src_cache.stride(0); dim3 grid(num_blocks); dim3 block(std::min(block_stride, int64_t(512))); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - + if (src_cache.dtype() == at::ScalarType::Float) { CALL_CONVERT_FP8_E5M2(uint8_t, float); } else if (src_cache.dtype() == at::ScalarType::Half) { diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh index afd37cc9da0d3..7bc70d9264ab8 100644 --- a/csrc/quantization/fp8/amd_detail/quant_utils.cuh +++ 
b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -10,7 +10,7 @@ namespace vllm { - +namespace fp8_e4m3 { template __inline__ __device__ Tout vec_conversion(const Tin& x) { @@ -290,4 +290,5 @@ __inline__ __device__ bf16_8_t vec_conversion(const Float8_& b.w = __float22bfloat162_rn(a.w); return b; } +} } // namespace vllm diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 275ef8194d0bd..52eb16d8d04c5 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -99,6 +99,7 @@ def test_copy_blocks( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( kv_cache_factory, @@ -110,6 +111,7 @@ def test_reshape_and_cache( dtype: torch.dtype, seed: int, device: int, + kv_cache_dtype: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) @@ -130,8 +132,8 @@ def test_reshape_and_cache( # Create the KV caches. key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, dtype, - None, seed, gpu_id) + num_heads, head_size, kv_cache_dtype, + dtype, seed, gpu_id) key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. @@ -140,7 +142,7 @@ def test_reshape_and_cache( # Call the reshape_and_cache kernel. cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, "auto") + slot_mapping, kv_cache_dtype) # Run the reference implementation. 
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) From 18b2516638cf5c66ceacfb2a41a9df029aee960d Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 7 Feb 2024 14:41:42 -0800 Subject: [PATCH 007/159] Using fp8 in any cache tests that could support it --- csrc/cache_kernels.cu | 8 +++++++- tests/kernels/test_cache.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index f8fbe345d487d..239972a774d86 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -468,7 +468,13 @@ void convert_fp8_e5m2( src_device.index() == dst_device.index(), "src and dst must be on the same GPU"); } - const at::cuda::OptionalCUDAGuard device_guard(device_of(src_cache)); + at::cuda::OptionalCUDAGuard device_guard; + + if (src_device.is_cuda()) { + device_guard.set_device(src_device); + } else if (dst_device.is_cuda()) { + device_guard.set_device(dst_device); + } int64_t num_blocks = src_cache.size(0); int64_t block_stride = src_cache.stride(0); diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 52eb16d8d04c5..658f52ad8eee8 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -113,6 +113,8 @@ def test_reshape_and_cache( device: int, kv_cache_dtype: str, ) -> None: + if kv_cache_dtype != "auto": + return # No alternative fp8 operation to compare to random.seed(seed) torch.random.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -169,6 +171,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( kv_cache_factory, @@ -181,7 +184,10 @@ def test_swap_blocks( dtype: torch.dtype, seed: int, device: int, + kv_cache_dtype: str, ) -> None: + if kv_cache_dtype == "fp8_e5m2" and "cpu" in direction: + return random.seed(seed) 
torch.random.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -202,12 +208,12 @@ def test_swap_blocks( # Create the KV caches on the first device. src_key_caches, src_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed, + num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed, src_device) # Create the KV caches on the second device. dist_key_caches, dist_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed, + num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed, dst_device) src_key_caches_clone = src_key_caches[0].clone() From 81a68593ab989dcbcc7e2612e7c6c033285acfd2 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 8 Feb 2024 08:48:16 -0800 Subject: [PATCH 008/159] Integrate e4m3 alongside e5m2 and adapt cache tests --- .gitignore | 1 + csrc/attention/attention_dtypes.h | 2 +- .../{dtype_fp8_e5m2.cuh => dtype_fp8.cuh} | 4 +- csrc/cache.h | 2 +- csrc/cache_kernels.cu | 36 +++++----- csrc/pybind.cpp | 4 +- setup.py | 1 + tests/kernels/test_attention.py | 4 +- tests/kernels/test_cache.py | 65 +++++++++++++++++-- 9 files changed, 88 insertions(+), 31 deletions(-) rename csrc/attention/{dtype_fp8_e5m2.cuh => dtype_fp8.cuh} (85%) diff --git a/.gitignore b/.gitignore index b5195629e5cf3..b1513ef0ddb0c 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,7 @@ _build/ # hip files generated by PyTorch *.hip *_hip* +hip_compat.h # Benchmark dataset *.json diff --git a/csrc/attention/attention_dtypes.h b/csrc/attention/attention_dtypes.h index 61748e6b1eee6..64f86381d9db9 100644 --- a/csrc/attention/attention_dtypes.h +++ b/csrc/attention/attention_dtypes.h @@ -4,4 +4,4 @@ #include "dtype_float16.cuh" #include "dtype_float32.cuh" #include "dtype_bfloat16.cuh" -#include "dtype_fp8_e5m2.cuh" +#include "dtype_fp8.cuh" diff --git a/csrc/attention/dtype_fp8_e5m2.cuh b/csrc/attention/dtype_fp8.cuh similarity index 85% rename 
from csrc/attention/dtype_fp8_e5m2.cuh rename to csrc/attention/dtype_fp8.cuh index 7d9e15e24237d..d11dee91ebe87 100644 --- a/csrc/attention/dtype_fp8_e5m2.cuh +++ b/csrc/attention/dtype_fp8.cuh @@ -8,7 +8,7 @@ #endif namespace vllm { -//#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) // fp8 vector types for quantization of kv cache template<> @@ -30,6 +30,6 @@ template<> struct Vec { using Type = uint2; }; -//#endif // ENABLE_FP8_E5M2 +#endif // ENABLE_FP8_E5M2 } // namespace vllm diff --git a/csrc/cache.h b/csrc/cache.h index 21c71830f7942..aafee5524fe2c 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -31,6 +31,6 @@ void gather_cached_kv( torch::Tensor& slot_mapping); // Just for unittest -void convert_fp8_e5m2( +void convert_fp8( torch::Tensor& src_cache, torch::Tensor& dst_cache); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 239972a774d86..39500521989be 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -4,9 +4,9 @@ #include "cuda_compat.h" #include "dispatch_utils.h" -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" -#else +#else if defined(ENABLE_FP8_E4M3) #include "quantization/fp8/amd_detail/quant_utils.cuh" #endif @@ -198,12 +198,14 @@ __global__ void reshape_and_cache_kernel( scalar_t tgt_key = key[src_key_idx]; scalar_t tgt_value = value[src_value_idx]; if constexpr (is_fp8_e5m2_kv_cache) { -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); -#else +#elif defined(ENABLE_FP8_E4M3) key_cache[tgt_key_idx] = fp8_e4m3::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e4m3::vec_conversion(tgt_value); +#else + assert(false); #endif } else { key_cache[tgt_key_idx] = tgt_key; @@ -434,30 +436,32 @@ void gather_cached_kv( namespace vllm { template -__global__ void 
convert_fp8_e5m2_kernel( +__global__ void convert_fp8_kernel( const Tin* __restrict__ src_cache, Tout* __restrict__ dst_cache, const int64_t block_stride) { const int64_t block_idx = blockIdx.x; for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { int64_t idx = block_idx * block_stride + i; - #ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion(src_cache[idx]); -#else +#elif defined(ENABLE_FP8_E4M3) dst_cache[idx] = fp8_e4m3::vec_conversion(src_cache[idx]); +#else + assert(false); #endif } } } // namespace vllm -#define CALL_CONVERT_FP8_E5M2(Tout, Tin) \ - vllm::convert_fp8_e5m2_kernel<<>>( \ +#define CALL_CONVERT_FP8(Tout, Tin) \ + vllm::convert_fp8_kernel<<>>( \ reinterpret_cast(src_cache.data_ptr()), \ reinterpret_cast(dst_cache.data_ptr()), \ block_stride); -void convert_fp8_e5m2( +void convert_fp8( torch::Tensor& src_cache, torch::Tensor& dst_cache) { @@ -483,16 +487,16 @@ void convert_fp8_e5m2( const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (src_cache.dtype() == at::ScalarType::Float) { - CALL_CONVERT_FP8_E5M2(uint8_t, float); + CALL_CONVERT_FP8(uint8_t, float); } else if (src_cache.dtype() == at::ScalarType::Half) { - CALL_CONVERT_FP8_E5M2(uint8_t, uint16_t); + CALL_CONVERT_FP8(uint8_t, uint16_t); } else if (src_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8_E5M2(uint8_t, __nv_bfloat16); + CALL_CONVERT_FP8(uint8_t, __nv_bfloat16); } else if (dst_cache.dtype() == at::ScalarType::Float) { - CALL_CONVERT_FP8_E5M2(float, uint8_t); + CALL_CONVERT_FP8(float, uint8_t); } else if (dst_cache.dtype() == at::ScalarType::Half) { - CALL_CONVERT_FP8_E5M2(uint16_t, uint8_t); + CALL_CONVERT_FP8(uint16_t, uint8_t); } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8_E5M2(__nv_bfloat16, uint8_t); + CALL_CONVERT_FP8(__nv_bfloat16, uint8_t); } } diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 8a8235691ab8e..7159952287090 100644 --- a/csrc/pybind.cpp 
+++ b/csrc/pybind.cpp @@ -80,8 +80,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &gather_cached_kv, "Gather key and value from the cache into contiguous QKV tensors"); cache_ops.def( - "convert_fp8_e5m2", - &convert_fp8_e5m2, + "convert_fp8", + &convert_fp8, "Convert the key and value cache to fp8_e5m2 data type"); // Cuda utils diff --git a/setup.py b/setup.py index 3e2127855a755..03bd2072a2313 100644 --- a/setup.py +++ b/setup.py @@ -296,6 +296,7 @@ def get_torch_arch_list() -> Set[str]: f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" f"amdgpu_arch_found: {arch}") NVCC_FLAGS += [f"--offload-arch={arch}"] + NVCC_FLAGS += ["-DENABLE_FP8_E4M3"] elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index cbb1d40623c71..c2116ece165cb 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -230,14 +230,14 @@ def test_paged_attention( dequantized_key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device=gpu_id) - cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache) + cache_ops.convert_fp8(key_cache, dequantized_key_cache) key_cache = dequantized_key_cache value_cache_shape = value_cache.shape dequantized_value_cache = torch.empty(size=value_cache_shape, dtype=dtype, device=gpu_id) - cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache) + cache_ops.convert_fp8(value_cache, dequantized_value_cache) value_cache = dequantized_value_cache ref_output = torch.empty_like(query) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 658f52ad8eee8..77a7331d4b94d 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -113,8 +113,6 @@ def test_reshape_and_cache( device: int, kv_cache_dtype: str, ) -> None: - if kv_cache_dtype != "auto": - return # No alternative fp8 operation to compare to random.seed(seed) torch.random.manual_seed(seed) torch.cuda.manual_seed(seed) @@ -139,12 
+137,24 @@ def test_reshape_and_cache( key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() + if kv_cache_dtype == "fp8_e5m2": + cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + cache_ops.convert_fp8(key_cache, cloned_key_cache) + cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + cache_ops.convert_fp8(value_cache, cloned_value_cache) + else: + cloned_key_cache = key_cache.clone() + cloned_value_cache = value_cache.clone() # Call the reshape_and_cache kernel. cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) + + if kv_cache_dtype == "fp8_e5m2": + result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + cache_ops.convert_fp8(key_cache, result_key_cache) + result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + cache_ops.convert_fp8(value_cache, result_value_cache) # Run the reference implementation. 
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) @@ -157,9 +167,13 @@ def test_reshape_and_cache( block_offset = block_offsets[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] - - assert torch.allclose(key_cache, cloned_key_cache) - assert torch.allclose(value_cache, cloned_value_cache) + + if kv_cache_dtype == "fp8_e5m2": + assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.01, rtol=0.1) + assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.01, rtol=0.1) + else: + assert torch.allclose(key_cache, cloned_key_cache) + assert torch.allclose(value_cache, cloned_value_cache) @pytest.mark.parametrize("direction", COPYING_DIRECTION) @@ -229,3 +243,40 @@ def test_swap_blocks( dist_key_caches[0][dst].cpu()) assert torch.allclose(src_value_caches_clone[src].cpu(), dist_value_caches[0][dst].cpu()) + + +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", DEVICES) +@torch.inference_mode() +def test_fp8_conversion( + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: int, +) -> None: + random.seed(seed) + torch.random.manual_seed(seed) + torch.cuda.manual_seed(seed) + gpu_id = f"cuda:{device}" + + low = -240.0 + high = 240.0 + shape = (num_blocks, num_heads, head_size, block_size) + cache = torch.empty(shape, dtype=dtype, device=gpu_id) + cache.uniform_(low, high) + + cache_fp8 = torch.empty_like(cache, dtype=torch.uint8) + cache_ops.convert_fp8(cache, cache_fp8) + + converted_cache = torch.empty_like(cache) + cache_ops.convert_fp8(cache_fp8, converted_cache) + + assert torch.allclose(cache, converted_cache, 
atol=0.01, rtol=0.1) From 83089d04d30c490f864f5b29ac9c82faa1cfc50f Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 8 Feb 2024 08:50:49 -0800 Subject: [PATCH 009/159] Add gfx942 to the arch list --- csrc/cache_kernels.cu | 7 +++---- setup.py | 2 +- vllm/utils.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 39500521989be..16dcd05a2919f 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -457,8 +457,8 @@ __global__ void convert_fp8_kernel( #define CALL_CONVERT_FP8(Tout, Tin) \ vllm::convert_fp8_kernel<<>>( \ - reinterpret_cast(src_cache.data_ptr()), \ - reinterpret_cast(dst_cache.data_ptr()), \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst_cache.data_ptr()), \ block_stride); void convert_fp8( @@ -473,7 +473,6 @@ void convert_fp8( "src and dst must be on the same GPU"); } at::cuda::OptionalCUDAGuard device_guard; - if (src_device.is_cuda()) { device_guard.set_device(src_device); } else if (dst_device.is_cuda()) { @@ -485,7 +484,7 @@ void convert_fp8( dim3 grid(num_blocks); dim3 block(std::min(block_stride, int64_t(512))); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - + if (src_cache.dtype() == at::ScalarType::Float) { CALL_CONVERT_FP8(uint8_t, float); } else if (src_cache.dtype() == at::ScalarType::Half) { diff --git a/setup.py b/setup.py index 03bd2072a2313..958d5b24d4e5f 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ # Supported NVIDIA GPU architectures. 
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) diff --git a/vllm/utils.py b/vllm/utils.py index dc81741498356..cc8a37ab7488a 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -212,7 +212,7 @@ def _generate_random_fp8_e5m2( from vllm._C import cache_ops tensor_tmp = torch.empty_like(tensor, dtype=torch.float16) tensor_tmp.uniform_(low, high) - cache_ops.convert_fp8_e5m2(tensor_tmp, tensor) + cache_ops.convert_fp8(tensor_tmp, tensor) del tensor_tmp From 926e2b867b1bd6f5eb26c2f1c70b98b1108b233c Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 8 Feb 2024 10:44:08 -0800 Subject: [PATCH 010/159] Less forgiving atol in fp8 tests --- tests/kernels/test_cache.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 77a7331d4b94d..f96c5b04c057e 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -169,8 +169,8 @@ def test_reshape_and_cache( cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8_e5m2": - assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.01, rtol=0.1) - assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.01, rtol=0.1) + assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1) + assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1) else: assert torch.allclose(key_cache, cloned_key_cache) assert torch.allclose(value_cache, cloned_value_cache) @@ -279,4 +279,4 @@ def test_fp8_conversion( converted_cache = torch.empty_like(cache) cache_ops.convert_fp8(cache_fp8, converted_cache) - assert torch.allclose(cache, converted_cache, atol=0.01, rtol=0.1) + assert torch.allclose(cache, 
converted_cache, atol=0.001, rtol=0.1) From 1b8bc9f18fe8e79d4c47cb1b6bf097e37c5c0657 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 9 Feb 2024 02:36:53 +0000 Subject: [PATCH 011/159] enable fp8-e4m3 kv cache on rocm --- csrc/attention/attention_kernels.cu | 18 ++++++++++++++---- csrc/cache_kernels.cu | 2 +- vllm/config.py | 10 ++++++++++ vllm/engine/arg_utils.py | 2 +- vllm/utils.py | 3 ++- 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e2..27b542fedcb7e 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -29,6 +29,10 @@ #include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" #endif +#ifdef ENABLE_FP8_E4M3 +#include "../quantization/fp8/amd_detail/quant_utils.cuh" +#endif + #include #ifndef USE_ROCM @@ -150,7 +154,7 @@ __device__ void paged_attention_kernel( constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); using K_vec = typename Vec::Type; using Q_vec = typename Vec::Type; -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) using Quant_vec = typename Vec::Type; #endif @@ -221,6 +225,9 @@ __device__ void paged_attention_kernel( Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); // Vector conversion from Quant_vec to K_vec. 
k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); +#elif defined(ENABLE_FP8_E4M3) + Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + k_vecs[j] = fp8_e4m3::vec_conversion(k_vec_quant); #else assert(false); #endif @@ -300,7 +307,7 @@ __device__ void paged_attention_kernel( constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE); using V_vec = typename Vec::Type; using L_vec = typename Vec::Type; -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) using V_quant_vec = typename Vec::Type; #endif using Float_L_vec = typename FloatVec::Type; @@ -341,6 +348,9 @@ __device__ void paged_attention_kernel( V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); +#elif defined(ENABLE_FP8_E4M3) + V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); + v_vec = fp8_e4m3::vec_conversion(v_quant_vec); #else assert(false); #endif @@ -739,7 +749,7 @@ void paged_attention_v1( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { if (query.dtype() == at::ScalarType::Float) { CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { @@ -932,7 +942,7 @@ void paged_attention_v2( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { if (query.dtype() == at::ScalarType::Float) { CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 16dcd05a2919f..56745e630920e 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -259,7 +259,7 @@ void reshape_and_cache( } else if (key.dtype() == 
at::ScalarType::BFloat16) { CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { if (key.dtype() == at::ScalarType::Float) { CALL_RESHAPE_AND_CACHE(float, uint8_t, true); } else if (key.dtype() == at::ScalarType::Half) { diff --git a/vllm/config.py b/vllm/config.py index 4fb7357a3da21..8f76f9d0ffa95 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -324,6 +324,16 @@ def _verify_cache_dtype(self) -> None: "But it may cause slight accuracy drop. " "Currently we only support fp8 without scaling factors and " "make e5m2 as a default format.") + elif self.cache_dtype == "fp8_e4m3": + device_name = torch.cuda.get_device_name() + if not "AMD" in device_name: + raise NotImplementedError( + "FP8_E4M3 KV Cache on NVIDIA GPU has not been supported yet.") + logger.info( + "Using fp8_e4m3 data type to store kv cache. It reduces " + "the GPU memory footprint and boosts the performance. " + "But it may cause slight accuracy drop. " + "Currently we only support fp8 without scaling factors") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 231ce3321cdc4..46d0a52c5f97f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -126,7 +126,7 @@ def add_cli_args( parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8_e5m2'], + choices=['auto', 'fp8_e5m2', 'fp8_e4m3'], default='auto', help='Data type for kv cache storage. If "auto", will use model ' 'data type. 
Note FP8 is not supported when cuda version is ' diff --git a/vllm/utils.py b/vllm/utils.py index cc8a37ab7488a..1b66a13cb5518 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,6 +29,7 @@ "bfloat16": torch.bfloat16, "float": torch.float, "fp8_e5m2": torch.uint8, + "fp8_e4m3": torch.uint8, } @@ -240,7 +241,7 @@ def create_kv_caches_with_random( raise ValueError(f"Invalid model dtype: {model_dtype}") elif cache_dtype in ["half", "bfloat16", "float"]: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - elif cache_dtype == "fp8_e5m2": + elif cache_dtype == "fp8_e5m2" or "fp8_e4m3": torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") From 777cc35c61e7df198a640891a97cf32b2012661c Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 15:30:59 +0000 Subject: [PATCH 012/159] Rename remaining fp8_e5m2 to general fp8 Reduce fp8 range in the conversion test to match e4m3 Add other MI300 architectures to the list Simplify device guard use in conversion kernel --- benchmarks/benchmark_latency.py | 2 +- benchmarks/benchmark_throughput.py | 2 +- .../kernels/benchmark_paged_attention.py | 2 +- csrc/attention/attention_kernels.cu | 4 ++-- csrc/cache_kernels.cu | 20 ++++++++----------- csrc/pybind.cpp | 2 +- ...fp8_e5m2_kv_cache.rst => fp8_kv_cache.rst} | 4 ++-- setup.py | 2 +- tests/kernels/test_attention.py | 6 +++--- tests/kernels/test_cache.py | 14 ++++++------- vllm/config.py | 19 ++++++++---------- vllm/engine/arg_utils.py | 2 +- vllm/utils.py | 14 ++++++------- 13 files changed, 43 insertions(+), 50 deletions(-) rename docs/source/quantization/{fp8_e5m2_kv_cache.rst => fp8_kv_cache.rst} (92%) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 7173134358762..30df87d1419a8 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -121,7 +121,7 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( "--kv-cache-dtype", type=str, - 
choices=['auto', 'fp8_e5m2'], + choices=['auto', 'fp8'], default='auto', help= 'Data type for kv cache storage. If "auto", will use model data type.') diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index d45d33307c912..83b1182256c8d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -290,7 +290,7 @@ def main(args: argparse.Namespace): parser.add_argument( "--kv-cache-dtype", type=str, - choices=["auto", "fp8_e5m2"], + choices=["auto", "fp8"], default="auto", help= 'Data type for kv cache storage. If "auto", will use model data type.') diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 56fe1b921d44e..da49d84d97005 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -171,7 +171,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument( "--kv-cache-dtype", type=str, - choices=["auto", "fp8_e5m2"], + choices=["auto", "fp8"], default="auto", help= 'Data type for kv cache storage. 
If "auto", will use model data type.') diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e2..6aec56bab4b28 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -739,7 +739,7 @@ void paged_attention_v1( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8") { if (query.dtype() == at::ScalarType::Float) { CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { @@ -932,7 +932,7 @@ void paged_attention_v2( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8") { if (query.dtype() == at::ScalarType::Float) { CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 16dcd05a2919f..15b75c9acdae2 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -259,7 +259,7 @@ void reshape_and_cache( } else if (key.dtype() == at::ScalarType::BFloat16) { CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8") { if (key.dtype() == at::ScalarType::Float) { CALL_RESHAPE_AND_CACHE(float, uint8_t, true); } else if (key.dtype() == at::ScalarType::Half) { @@ -467,17 +467,13 @@ void convert_fp8( { torch::Device src_device = src_cache.device(); torch::Device dst_device = dst_cache.device(); - if (src_device.is_cuda() && dst_device.is_cuda()) { - TORCH_CHECK( - src_device.index() == dst_device.index(), - "src and dst must be on the same GPU"); - } - at::cuda::OptionalCUDAGuard device_guard; - if (src_device.is_cuda()) { - device_guard.set_device(src_device); - } else if (dst_device.is_cuda()) { - device_guard.set_device(dst_device); - } + 
TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") + TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU") + TORCH_CHECK( + src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); + at::cuda::OptionalCUDAGuard device_guard(src_device); + int64_t num_blocks = src_cache.size(0); int64_t block_stride = src_cache.stride(0); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 7159952287090..736a2a6af91ff 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -82,7 +82,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { cache_ops.def( "convert_fp8", &convert_fp8, - "Convert the key and value cache to fp8_e5m2 data type"); + "Convert the key and value cache to fp8 data type"); // Cuda utils pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); diff --git a/docs/source/quantization/fp8_e5m2_kv_cache.rst b/docs/source/quantization/fp8_kv_cache.rst similarity index 92% rename from docs/source/quantization/fp8_e5m2_kv_cache.rst rename to docs/source/quantization/fp8_kv_cache.rst index 10437260ad964..8db69a9c5b765 100644 --- a/docs/source/quantization/fp8_e5m2_kv_cache.rst +++ b/docs/source/quantization/fp8_kv_cache.rst @@ -1,4 +1,4 @@ -.. _fp8_e5m2_kv_cache: +.. _fp8_kv_cache: FP8 E5M2 KV Cache ================== @@ -20,7 +20,7 @@ Here is an example of how to enable this feature: # Create a sampling params object. sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2") + llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/setup.py b/setup.py index 958d5b24d4e5f..e437d1a89ec48 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ # Supported NVIDIA GPU architectures. 
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index c2116ece165cb..fe50a60f71adc 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -25,7 +25,7 @@ HEAD_SIZES = [64, 80, 96, 112, 128, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] -KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] +KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] @@ -222,7 +222,7 @@ def test_paged_attention( raise AssertionError(f"Unknown version: {version}") # Run the reference implementation. - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": # Convert cache data back to dtype. x = 16 // torch.tensor([], dtype=dtype).element_size() key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, @@ -259,7 +259,7 @@ def test_paged_attention( # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. 
atol, rtol = 1e-3, 1e-5 - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": atol, rtol = 1e-2, 1e-5 assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f96c5b04c057e..6db2c81f7aeaa 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -18,7 +18,7 @@ NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] -KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] +KV_CACHE_DTYPE = ["auto", "fp8"] @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @@ -137,7 +137,7 @@ def test_reshape_and_cache( key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) cache_ops.convert_fp8(key_cache, cloned_key_cache) cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) @@ -150,7 +150,7 @@ def test_reshape_and_cache( cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) cache_ops.convert_fp8(key_cache, result_key_cache) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) @@ -168,7 +168,7 @@ def test_reshape_and_cache( cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1) assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1) else: @@ -200,7 +200,7 @@ def test_swap_blocks( device: int, kv_cache_dtype: str, ) -> None: - if kv_cache_dtype == "fp8_e5m2" and "cpu" in direction: + if kv_cache_dtype 
== "fp8" and "cpu" in direction: return random.seed(seed) torch.random.manual_seed(seed) @@ -267,8 +267,8 @@ def test_fp8_conversion( torch.cuda.manual_seed(seed) gpu_id = f"cuda:{device}" - low = -240.0 - high = 240.0 + low = -224.0 + high = 224.0 shape = (num_blocks, num_heads, head_size, block_size) cache = torch.empty(shape, dtype=dtype, device=gpu_id) cache.uniform_(low, high) diff --git a/vllm/config.py b/vllm/config.py index 4fb7357a3da21..2addcb67e15d6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -308,18 +308,15 @@ def _verify_args(self) -> None: def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass - elif self.cache_dtype == "fp8_e5m2": - nvcc_cuda_version = get_nvcc_cuda_version() - if nvcc_cuda_version < Version("11.8"): - raise ValueError( - "FP8 is not supported when cuda version is lower than 11.8." - ) - device_name = torch.cuda.get_device_name() - if "AMD" in device_name: - raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") + elif self.cache_dtype == "fp8": + if not is_hip(): + nvcc_cuda_version = get_nvcc_cuda_version() + if nvcc_cuda_version < Version("11.8"): + raise ValueError( + "FP8 is not supported when cuda version is lower than 11.8." + ) logger.info( - "Using fp8_e5m2 data type to store kv cache. It reduces " + "Using fp8 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. " "Currently we only support fp8 without scaling factors and " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 231ce3321cdc4..66dcc993af355 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -126,7 +126,7 @@ def add_cli_args( parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8_e5m2'], + choices=['auto', 'fp8'], default='auto', help='Data type for kv cache storage. If "auto", will use model ' 'data type. 
Note FP8 is not supported when cuda version is ' diff --git a/vllm/utils.py b/vllm/utils.py index cc8a37ab7488a..8d3923cc5b0ad 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -28,7 +28,7 @@ "half": torch.half, "bfloat16": torch.bfloat16, "float": torch.float, - "fp8_e5m2": torch.uint8, + "fp8": torch.uint8, } @@ -196,7 +196,7 @@ def get_nvcc_cuda_version() -> Version: return nvcc_cuda_version -def _generate_random_fp8_e5m2( +def _generate_random_fp8( tensor: torch.tensor, low: float, high: float, @@ -240,7 +240,7 @@ def create_kv_caches_with_random( raise ValueError(f"Invalid model dtype: {model_dtype}") elif cache_dtype in ["half", "bfloat16", "float"]: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - elif cache_dtype == "fp8_e5m2": + elif cache_dtype == "fp8": torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") @@ -259,8 +259,8 @@ def create_kv_caches_with_random( device=device) if cache_dtype in ["auto", "half", "bfloat16", "float"]: key_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8_e5m2': - _generate_random_fp8_e5m2(key_cache, -scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(key_cache, -scale, scale) key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) @@ -271,7 +271,7 @@ def create_kv_caches_with_random( device=device) if cache_dtype in ["auto", "half", "bfloat16", "float"]: value_cache.uniform_(-scale, scale) - elif cache_dtype == 'fp8_e5m2': - _generate_random_fp8_e5m2(value_cache, -scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(value_cache, -scale, scale) value_caches.append(value_cache) return key_caches, value_caches From a2897d6689acebcf052c506bba950b856fed2977 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 9 Feb 2024 15:34:40 +0000 Subject: [PATCH 013/159] Fix or comparisons --- csrc/cache_kernels.cu | 2 +- vllm/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/cache_kernels.cu 
b/csrc/cache_kernels.cu index 56745e630920e..03b3cebe4c8fa 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -259,7 +259,7 @@ void reshape_and_cache( } else if (key.dtype() == at::ScalarType::BFloat16) { CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); } - } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { + } else if (kv_cache_dtype == "fp8_e5m2" || kv_cache_dtype == "fp8_e4m3") { if (key.dtype() == at::ScalarType::Float) { CALL_RESHAPE_AND_CACHE(float, uint8_t, true); } else if (key.dtype() == at::ScalarType::Half) { diff --git a/vllm/utils.py b/vllm/utils.py index 1b66a13cb5518..5c9c064979412 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -241,7 +241,7 @@ def create_kv_caches_with_random( raise ValueError(f"Invalid model dtype: {model_dtype}") elif cache_dtype in ["half", "bfloat16", "float"]: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - elif cache_dtype == "fp8_e5m2" or "fp8_e4m3": + elif cache_dtype in ["fp8_e5m2", "fp8_e4m3"]: torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") From a06ddac922ac1aa292200fc9d36aac82842bcb8d Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 15:48:50 +0000 Subject: [PATCH 014/159] Add e4m3 to attention kernels --- csrc/attention/attention_kernels.cu | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 6aec56bab4b28..bdba84a4b858b 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -25,8 +25,10 @@ #include "attention_dtypes.h" #include "attention_utils.cuh" -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) #include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" +#elif defined(ENABLE_FP8_E4M3) +#include "../quantization/fp8/amd_detail/quant_utils.cuh" #endif #include @@ -150,7 +152,7 @@ __device__ void paged_attention_kernel( constexpr int VEC_SIZE = MAX(16 / 
(THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); using K_vec = typename Vec::Type; using Q_vec = typename Vec::Type; -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) using Quant_vec = typename Vec::Type; #endif @@ -217,10 +219,14 @@ __device__ void paged_attention_kernel( const int offset1 = (vec_idx * VEC_SIZE) / x; const int offset2 = (vec_idx * VEC_SIZE) % x; if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); // Vector conversion from Quant_vec to K_vec. k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); +#elif defined(ENABLE_FP8_E4M3) + Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + // Vector conversion from Quant_vec to K_vec. + k_vecs[j] = fp8_e4m3::vec_conversion(k_vec_quant); #else assert(false); #endif @@ -300,7 +306,7 @@ __device__ void paged_attention_kernel( constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE); using V_vec = typename Vec::Type; using L_vec = typename Vec::Type; -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) using V_quant_vec = typename Vec::Type; #endif using Float_L_vec = typename FloatVec::Type; @@ -337,10 +343,14 @@ __device__ void paged_attention_kernel( const int offset = row_idx * BLOCK_SIZE + physical_block_offset; V_vec v_vec; if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); +#elif defined(ENABLE_FP8_E4M3) + V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); + // Vector conversion from V_quant_vec to V_vec. 
+ v_vec = fp8_e4m3::vec_conversion(v_quant_vec); #else assert(false); #endif From f358dcd752f20482172b90058c473b3bc188c01a Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 16:15:56 +0000 Subject: [PATCH 015/159] Remove remaining mentions of e5m2 where it refers to general fp8 --- csrc/attention/attention_kernels.cu | 48 ++++++++++++++--------------- csrc/cache_kernels.cu | 8 ++--- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index bdba84a4b858b..890f50e68349a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -88,7 +88,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int PARTITION_SIZE = 0> // Zero means no partitioning. __device__ void paged_attention_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -218,7 +218,7 @@ __device__ void paged_attention_kernel( const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; const int offset1 = (vec_idx * VEC_SIZE) / x; const int offset2 = (vec_idx * VEC_SIZE) % x; - if constexpr (IS_FP8_E5M2_KV_CACHE) { + if constexpr (IS_FP8_KV_CACHE) { #if defined(ENABLE_FP8_E5M2) Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); // Vector conversion from Quant_vec to K_vec. @@ -342,7 +342,7 @@ __device__ void paged_attention_kernel( if (row_idx < HEAD_SIZE) { const int offset = row_idx * BLOCK_SIZE + physical_block_offset; V_vec v_vec; - if constexpr (IS_FP8_E5M2_KV_CACHE) { + if constexpr (IS_FP8_KV_CACHE) { #if defined(ENABLE_FP8_E5M2) V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. 
@@ -441,7 +441,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE> + bool IS_FP8_KV_CACHE> __global__ void paged_attention_v1_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -456,7 +456,7 @@ __global__ void paged_attention_v1_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -469,7 +469,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int PARTITION_SIZE> __global__ void paged_attention_v2_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -487,7 +487,7 @@ __global__ void paged_attention_v2_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -597,9 +597,9 @@ __global__ void paged_attention_v2_reduce_kernel( #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \ + IS_FP8_KV_CACHE>), shared_mem_size); \ vllm::paged_attention_v1_kernel<<>>( \ + IS_FP8_KV_CACHE><<>>( \ out_ptr, \ query_ptr, \ key_cache_ptr, \ @@ -619,7 +619,7 @@ template< typename T, typename CACHE_T, int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int NUM_THREADS = 128> void paged_attention_v1_launcher( torch::Tensor& out, @@ -695,8 +695,8 @@ void paged_attention_v1_launcher( } } -#define 
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v1_launcher( \ +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v1_launcher( \ out, \ query, \ key_cache, \ @@ -710,16 +710,16 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. -#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -766,7 +766,7 @@ void paged_attention_v1( #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ vllm::paged_attention_v2_kernel \ + IS_FP8_KV_CACHE, PARTITION_SIZE> \ <<>>( \ exp_sums_ptr, \ max_logits_ptr, \ @@ -796,7 +796,7 @@ template< typename T, typename CACHE_T, int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int NUM_THREADS = 128, int PARTITION_SIZE = 512> void paged_attention_v2_launcher( @@ -882,8 +882,8 @@ void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v2_launcher( \ +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v2_launcher( \ out, \ exp_sums, \ max_logits, \ @@ -900,16 +900,16 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. 
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 15b75c9acdae2..84f47ba458131 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -153,7 +153,7 @@ void copy_blocks( namespace vllm { -template +template __global__ void reshape_and_cache_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] @@ -197,7 +197,7 @@ __global__ void reshape_and_cache_kernel( + block_offset; scalar_t tgt_key = key[src_key_idx]; scalar_t tgt_value = value[src_value_idx]; - if constexpr (is_fp8_e5m2_kv_cache) { + if constexpr (is_fp8_kv_cache) { #if defined(ENABLE_FP8_E5M2) key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); @@ -216,8 +216,8 @@ __global__ void reshape_and_cache_kernel( } // namespace vllm -#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ - vllm::reshape_and_cache_kernel<<>>( \ +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ + vllm::reshape_and_cache_kernel<<>>( \ reinterpret_cast(key.data_ptr()), \ reinterpret_cast(value.data_ptr()), \ reinterpret_cast(key_cache.data_ptr()), \ From 4db00389ad9f0b4552ce025a73e0362bb3125a2e Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 9 Feb 2024 
17:09:48 +0000 Subject: [PATCH 016/159] Address naming conventions --- csrc/attention/attention_kernels.cu | 64 +++++++++++++++-------------- csrc/cache_kernels.cu | 6 +-- vllm/engine/arg_utils.py | 7 ++-- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 27b542fedcb7e..0719ec7d796fb 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -25,16 +25,20 @@ #include "attention_dtypes.h" #include "attention_utils.cuh" -#ifdef ENABLE_FP8_E5M2 -#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" -#endif -#ifdef ENABLE_FP8_E4M3 +#if defined(ENABLE_FP8_E5M2) +#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" +#elif defined(ENABLE_FP8_E4M3) #include "../quantization/fp8/amd_detail/quant_utils.cuh" #endif #include +#ifdef USE_ROCM + #include + typedef __hip_bfloat16 __nv_bfloat16; +#endif + #ifndef USE_ROCM #define WARP_SIZE 32 #else @@ -90,7 +94,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int PARTITION_SIZE = 0> // Zero means no partitioning. __device__ void paged_attention_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -220,8 +224,8 @@ __device__ void paged_attention_kernel( const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; const int offset1 = (vec_idx * VEC_SIZE) / x; const int offset2 = (vec_idx * VEC_SIZE) % x; - if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 + if constexpr (IS_FP8_KV_CACHE) { +#if defined(ENABLE_FP8_E5M2) Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); // Vector conversion from Quant_vec to K_vec. 
k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); @@ -343,8 +347,8 @@ __device__ void paged_attention_kernel( if (row_idx < HEAD_SIZE) { const int offset = row_idx * BLOCK_SIZE + physical_block_offset; V_vec v_vec; - if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 + if constexpr (IS_FP8_KV_CACHE) { +#if defined(ENABLE_FP8_E5M2) V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); @@ -441,7 +445,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE> + bool IS_FP8_KV_CACHE> __global__ void paged_attention_v1_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -456,7 +460,7 @@ __global__ void paged_attention_v1_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -469,7 +473,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int PARTITION_SIZE> __global__ void paged_attention_v2_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -487,7 +491,7 @@ __global__ void paged_attention_v2_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -597,9 +601,9 @@ __global__ void paged_attention_v2_reduce_kernel( #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ 
VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \ + IS_FP8_KV_CACHE>), shared_mem_size); \ vllm::paged_attention_v1_kernel<<>>( \ + IS_FP8_KV_CACHE><<>>( \ out_ptr, \ query_ptr, \ key_cache_ptr, \ @@ -619,7 +623,7 @@ template< typename T, typename CACHE_T, int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int NUM_THREADS = 128> void paged_attention_v1_launcher( torch::Tensor& out, @@ -695,8 +699,8 @@ void paged_attention_v1_launcher( } } -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v1_launcher( \ +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v1_launcher( \ out, \ query, \ key_cache, \ @@ -710,16 +714,16 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. -#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -766,7 +770,7 @@ void paged_attention_v1( #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ vllm::paged_attention_v2_kernel \ + IS_FP8_KV_CACHE, PARTITION_SIZE> \ <<>>( \ exp_sums_ptr, \ max_logits_ptr, \ @@ -796,7 +800,7 @@ template< typename T, typename CACHE_T, int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int NUM_THREADS = 128, int PARTITION_SIZE = 512> void paged_attention_v2_launcher( @@ -882,8 +886,8 @@ 
void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v2_launcher( \ +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v2_launcher( \ out, \ exp_sums, \ max_logits, \ @@ -900,16 +904,16 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. -#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 56745e630920e..e27d7fab48031 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -6,7 +6,7 @@ #include "dispatch_utils.h" #if defined(ENABLE_FP8_E5M2) #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" -#else if defined(ENABLE_FP8_E4M3) +#elif defined(ENABLE_FP8_E4M3) #include "quantization/fp8/amd_detail/quant_utils.cuh" #endif @@ -216,8 +216,8 @@ __global__ void reshape_and_cache_kernel( } // namespace vllm -#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ - vllm::reshape_and_cache_kernel<<>>( \ +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ + vllm::reshape_and_cache_kernel<<>>( \ reinterpret_cast(key.data_ptr()), \ reinterpret_cast(value.data_ptr()), \ reinterpret_cast(key_cache.data_ptr()), \ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 
46d0a52c5f97f..6d9ae5ce0cd9d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -128,9 +128,10 @@ def add_cli_args( type=str, choices=['auto', 'fp8_e5m2', 'fp8_e4m3'], default='auto', - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' + 'On AMD GPUs, only a more standard FP8_E4M3 is supported for inference. ') + parser.add_argument('--max-model-len', type=int, default=None, From 17a91a01cfdfdb75804e3dbfd2f53b8fb57d6562 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 18:09:46 +0000 Subject: [PATCH 017/159] More verbose help message for fp8 cache type --- vllm/engine/arg_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 66dcc993af355..f8708ea83a6b9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -128,9 +128,9 @@ def add_cli_args( type=str, choices=['auto', 'fp8'], default='auto', - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. 
' + 'On AMD GPUs, only a more standard FP8_E4M3 is supported for inference.') parser.add_argument('--max-model-len', type=int, default=None, From 4fbc915bbf672a3138ed61a4f6e22adc97972a6d Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 18:42:43 +0000 Subject: [PATCH 018/159] Updated fp8 help text in additional files sililar to arg_utils --- benchmarks/benchmark_latency.py | 5 +++-- benchmarks/benchmark_throughput.py | 5 +++-- benchmarks/kernels/benchmark_paged_attention.py | 5 +++-- vllm/engine/arg_utils.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 30df87d1419a8..66938793276b4 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -123,8 +123,9 @@ def run_to_completion(profile_dir: Optional[str] = None): type=str, choices=['auto', 'fp8'], default='auto', - help= - 'Data type for kv cache storage. If "auto", will use model data type.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' + 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 83b1182256c8d..13684f829ae74 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -292,8 +292,9 @@ def main(args: argparse.Namespace): type=str, choices=["auto", "fp8"], default="auto", - help= - 'Data type for kv cache storage. If "auto", will use model data type.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. 
' + 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index da49d84d97005..472dc444b2c52 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -173,8 +173,9 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: type=str, choices=["auto", "fp8"], default="auto", - help= - 'Data type for kv cache storage. If "auto", will use model data type.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' + 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') args = parser.parse_args() print(args) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8708ea83a6b9..4f6d7b5e3d97b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -130,7 +130,7 @@ def add_cli_args( default='auto', help='Data type for kv cache storage. If "auto", will use model data ' 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. 
' - 'On AMD GPUs, only a more standard FP8_E4M3 is supported for inference.') + 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') parser.add_argument('--max-model-len', type=int, default=None, From 2c525debe41122e0eefcbbf1d0bb227ee810d513 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 9 Feb 2024 19:34:33 +0000 Subject: [PATCH 019/159] Fix merge conflict --- csrc/attention/attention_kernels.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index dda8d100c21f3..602e2e496af91 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -948,11 +948,7 @@ void paged_attention_v2( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } -<<<<<<< HEAD - } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { -======= } else if (kv_cache_dtype == "fp8") { ->>>>>>> 7f5623d1f7bd39c05e97dd80ba3e121c0473d51c if (query.dtype() == at::ScalarType::Float) { CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { From 54d1d4d4990da052e37a62b26a66ed33bda320cd Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 9 Feb 2024 19:48:04 +0000 Subject: [PATCH 020/159] generalize fp8 convention --- vllm/config.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 1d020abba159c..2d65ce076084b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -319,18 +319,8 @@ def _verify_cache_dtype(self) -> None: "Using fp8 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. 
" - "Currently we only support fp8 without scaling factors and " - "make e5m2 as a default format.") - elif self.cache_dtype == "fp8_e4m3": - device_name = torch.cuda.get_device_name() - if not "AMD" in device_name: - raise NotImplementedError( - "FP8_E4M3 KV Cache on NVIDIA GPU has not been supported yet.") - logger.info( - "Using fp8_e4m3 data type to store kv cache. It reduces " - "the GPU memory footprint and boosts the performance. " - "But it may cause slight accuracy drop. " - "Currently we only support fp8 without scaling factors") + "FP8_E5M2 is only supported on cuda version greater than 11.8." + "On AMD GPUs, only a more standard FP8_E4M3 is supported for inference.") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") From 7a9db00323aa4d7b68ddc9cb7674da6e5f0acadb Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Fri, 9 Feb 2024 16:18:48 -0800 Subject: [PATCH 021/159] Update log info and args description w.r.t. FP8 KV cache. --- benchmarks/benchmark_latency.py | 6 +++--- benchmarks/benchmark_throughput.py | 6 +++--- benchmarks/kernels/benchmark_paged_attention.py | 6 +++--- vllm/config.py | 6 +++--- vllm/engine/arg_utils.py | 6 +++--- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 66938793276b4..1a910d9775fa2 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -123,9 +123,9 @@ def run_to_completion(profile_dir: Optional[str] = None): type=str, choices=['auto', 'fp8'], default='auto', - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') + help='Data type for kv cache storage. If "auto", will use model data type. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 13684f829ae74..6102d360ee9a1 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -292,9 +292,9 @@ def main(args: argparse.Namespace): type=str, choices=["auto", "fp8"], default="auto", - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') + help='Data type for kv cache storage. If "auto", will use model data type. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 472dc444b2c52..ce103af3240a8 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -173,9 +173,9 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: type=str, choices=["auto", "fp8"], default="auto", - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') + help='Data type for kv cache storage. If "auto", will use model data type. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') args = parser.parse_args() print(args) diff --git a/vllm/config.py b/vllm/config.py index 2d65ce076084b..d4cb6402c7269 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -318,9 +318,9 @@ def _verify_cache_dtype(self) -> None: logger.info( "Using fp8 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " - "But it may cause slight accuracy drop. " - "FP8_E5M2 is only supported on cuda version greater than 11.8." - "On AMD GPUs, only a more standard FP8_E4M3 is supported for inference.") + "But it may cause slight accuracy drop without scaling factors. " + "FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8." + "On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4f6d7b5e3d97b..f4c6994c1f69e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -128,9 +128,9 @@ def add_cli_args( type=str, choices=['auto', 'fp8'], default='auto', - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') + help='Data type for kv cache storage. If "auto", will use model data type. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument('--max-model-len', type=int, default=None, From 20b5f1066dc1424929cef39a7a7b5abcc753bd68 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Sat, 10 Feb 2024 17:42:42 +0000 Subject: [PATCH 022/159] Initial port of gradlib gemm tuner --- run.sh | 32 ++++++++ vllm/model_executor/layers/linear.py | 11 ++- vllm/model_executor/layers/tuned_gemm.py | 93 ++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 2 deletions(-) create mode 100755 run.sh create mode 100644 vllm/model_executor/layers/tuned_gemm.py diff --git a/run.sh b/run.sh new file mode 100755 index 0000000000000..7b9336a0a076a --- /dev/null +++ b/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash +BASE_DIR=/trees/ +VLLM_DIR=$BASE_DIR/vllm +GRAD_DIR=$BASE_DIR/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +#MODEL=/data/Llama-2-13B-Chat-fp16 +#MODEL=/data/llama-2-13b-chat-hf +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` + +GEN_LEN="8" +TP=8 +INPUT_LEN=2048 +ITER=1 +cd $VLLM_DIR + + echo "tuned_gemm_csv: ./tuned_tp$TP.csv" > $VLLM_DIR/tuned_perf_tp$TP.yaml + tuned_file=$VLLM_DIR/tuned_tp$TP.csv +export VLLM_PERF_YAML=./tuned_perf_tp$TP.yaml + +for tp in $TP; +do + for gen_len in $GEN_LEN; + do + for input_len in $INPUT_LEN; + do + +python benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ + --tensor-parallel-size $tp --num-iters $ITER + done +done +done diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 5e1d63a6a62eb..63ea7b856c1ee 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,8 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger +from vllm.model_executor.layers.tuned_gemm import tgemm + logger = init_logger(__name__) @@ -66,10 +68,14 @@ def 
apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: + #print(f">>> HELOOOOOOOOOOOOOO apply_weights {x.shape}, {weight.shape}, {bias}") + if bias: return F.linear(x, weight) + bias return F.linear(x, weight) - return F.linear(x, weight, bias) + #tgemm.mm(x,weight) + #return F.linear(x, weight, bias) + return tgemm.mm(x,weight) class ReplicatedLinear(torch.nn.Module): @@ -123,6 +129,7 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if not self.skip_bias_add else None output = self.linear_method.apply_weights(self.linear_weights, x, bias) + #print(f">>> output is {output}") output_bias = self.bias if self.skip_bias_add else None return output, output_bias @@ -548,7 +555,7 @@ def forward(self, input_): output_ = tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel - + #print(f">>> ROWPARALLEL {output_.shape}") if not self.skip_bias_add: output = output_ + self.bias if self.bias is not None else output_ output_bias = None diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py new file mode 100644 index 0000000000000..556a972bc3e34 --- /dev/null +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -0,0 +1,93 @@ +import torch +import torch.nn.functional as F +from rocsolidxgemm import rocb_create_extension,rocb_mm +from hipbsolidxgemm import hipb_create_extension,hipb_mm +import os +import yaml +import pandas as pd +#from vllm import custom_ops + + +class TunedGemm: + def __init__(self): + #rocb_create_extension() + #hipb_create_extension() + self.extensions_created = False + self.bestsols = {} + self.load_best_sols() + self.create_ds() + def load_best_sols(self): + perfbits = {} + perf_file = os.environ.get('VLLM_PERF_YAML') + if perf_file is not None: + with open(perf_file, 'r') as file: + perfbits = yaml.safe_load(file) + + tune_file = perfbits.get('tuned_gemm_csv',None) + if tune_file is 
not None: + self.bestsols = pd.read_csv(tune_file,index_col=[0]) + def apply_custom(self,ds): + M,N,K = ds['M'],ds['N'],ds['K'] + #apply custom matvec (only for f16 dtype) + return ds + if N==1: + ds1 = ds.copy() + ds1['libtype'] = 'custom' + if K==8192 and (M==1280 or M==7168): + ds1['solidx'] = 8 + return ds1 + elif K==3584 and M==8192: + ds1['solidx'] = 8 + return ds1 + elif K<=8192 and K%8==0 and M%4==0: + ds1['solidx'] = 1 + return ds1 + return ds + def create_ds(self): + df = self.bestsols + solds = {} + for i in range(len(df)): + ds = self.apply_custom(df.iloc[i]) + key = (ds['M'],ds['N'],ds['K']) + if ds['libtype']=='hipblaslt': soltype = 1 + elif ds['libtype']=='rocblas': soltype = 2 + elif ds['libtype']=='custom': soltype = 3 + solds[key] = (soltype,int(ds['solidx'])) + self.solids = solds + #print('>>>',solds) + def query_sol(self,m,n,k): + return self.solids.get((m,n,k),(0,0)) + def mm(self,inp,weights): + inp_view=inp.view(-1,inp.size(-1)) + #print(f'>>>inp_view {inp_view.shape}') + if self.extensions_created == False: + rocb_create_extension() + hipb_create_extension() + self.extensions_created = True + soltype,solidx = self.query_sol(m=weights.shape[0],n=inp_view.shape[0],k=inp_view.shape[1]) + if soltype==1: + #print(">>> found hipblas") + out = hipb_mm(inp_view,weights.t(),solidx) + elif soltype==3: + ##only matvec is supported currently + out = torch.empty(inp.shape[0],weights.shape[0],dtype=torch.float16,device='cuda') + #print('>>>Matvec',inp.shape,weights.shape,soltype,solidx) + if solidx<=1: + custom_ops.LLMM1(weights,inp,out,4) + elif solidx==2: + custom_ops.LLMM1(weights,inp,out,2) + elif solidx==8: + custom_ops.LLMM1(weights,inp,out,8) + elif solidx==20: + custom_ops.LLZZ(weights,inp,out,0) + elif solidx==21: + custom_ops.LLZZ(weights,inp,out,1) + elif soltype==2: + #print(">>> found rocblas") + out = rocb_mm(inp_view,weights.t(),solidx) + else: + #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + out = 
F.linear(inp,weights) + return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) + +tgemm = TunedGemm() \ No newline at end of file From 6f281079a78e6cd2ed3a3cedd2fa1ec1d6ee2a5a Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Sun, 11 Feb 2024 18:10:01 +0000 Subject: [PATCH 023/159] Enable torchrun vs Ray This is a bit of a hack. But Ray seems to have some serious perf degradation when running multi gpu latency benchmarks. Allow distributed to be used when Ray is disabled, and make sure we connect via env ranking instead of tcp/port based. --- vllm/config.py | 2 +- vllm/engine/llm_engine.py | 4 +-- vllm/engine/ray_utils.py | 20 +++++++++++--- .../parallel_utils/communication_op.py | 26 +++++++++++-------- vllm/worker/worker.py | 10 ++++--- 5 files changed, 41 insertions(+), 21 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 8acd15a3b7d9a..11952d9471d8f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -342,7 +342,7 @@ def __init__( self.world_size = pipeline_parallel_size * tensor_parallel_size if self.world_size > 1: - self.worker_use_ray = True + self.worker_use_ray = False self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0dedc232292dd..7c6808e32f3fa 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -129,8 +129,8 @@ def _init_workers(self): # before CUDA_VISIBLE_DEVICES is set in the Worker from vllm.worker.worker import Worker - assert self.parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") + # assert self.parallel_config.world_size == 1, ( + # "Ray is required if parallel_config.world_size > 1.") self.workers: List[Worker] = [] distributed_init_method = get_distributed_init_method( diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 1cb5fcda344f1..2abf571c5fc61 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,3 +1,4 @@ +import socket from typing import 
Optional, List, Tuple, TYPE_CHECKING from vllm.config import ParallelConfig @@ -50,6 +51,10 @@ def set_cuda_visible_devices(self, device_ids) -> None: if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup +def get_open_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + return s.getsockname()[1] def initialize_cluster( parallel_config: ParallelConfig, @@ -82,11 +87,18 @@ def initialize_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True) + # if not parallel_config.worker_use_ray: + # assert parallel_config.world_size == 1, ( + # "Ray is required if parallel_config.world_size > 1.") + # return None if not parallel_config.worker_use_ray: - assert parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - return None - + # Initialize cluster locally. + port = get_open_port() + # We need to setup the distributed init method to make sure + # the distributed megatron code (e.g., get world size) works correctly. + distributed_init_method = f"tcp://localhost:{port}" + return distributed_init_method, None + # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index fff6920be72b0..720b52cfc6904 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -63,6 +63,7 @@ def tensor_model_parallel_gather(input_: torch.Tensor, all the ranks. """ world_size = get_tensor_model_parallel_world_size() + # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ @@ -72,19 +73,22 @@ def tensor_model_parallel_gather(input_: torch.Tensor, # Convert negative dim to positive. dim += input_.dim() # Allocate output tensor. 
- if get_tensor_model_parallel_rank() == dst: - gather_list = [torch.empty_like(input_) for _ in range(world_size)] - else: - gather_list = None + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + # if get_tensor_model_parallel_rank() == dst: + # gather_list = [torch.empty_like(input_) for _ in range(world_size)] + # else: + # gather_list = None # Gather. - torch.distributed.gather(input_, - gather_list, - dst=dst, + + #print(f'>>> world size {world_size}, {gather_list}, {dst} {get_tensor_model_parallel_group()}') + torch.distributed.all_gather(gather_list, input_, group=get_tensor_model_parallel_group()) - if get_tensor_model_parallel_rank() == dst: - output_tensor = torch.cat(gather_list, dim=dim) - else: - output_tensor = None + output_tensor = torch.cat(gather_list, dim=dim) + # if get_tensor_model_parallel_rank() == dst: + # output_tensor = torch.cat(gather_list, dim=dim) + # else: + # output_tensor = None + #print(f'>>> output_tensor {output_tensor}, {dst}, {dim}') return output_tensor diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 845283586e147..aafd7306acf5d 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -72,7 +72,10 @@ def init_model(self) -> None: # This env var set by Ray causes exceptions with graph building. os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.device = torch.device(f"cuda:{self.local_rank}") + self.rank = self.rank if self.rank is not None else int( + os.getenv("RANK", "-1")) + local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(self.device) _check_if_gpu_supports_dtype(self.model_config.dtype) @@ -240,8 +243,9 @@ def _init_distributed_environment( torch.distributed.init_process_group( backend="nccl", world_size=parallel_config.world_size, - rank=rank, - init_method=distributed_init_method, + #rank=rank, + #init_method=distributed_init_method, + init_method="env://", ) # A small all_reduce for warmup. 
From 184806e673c9e473f5559df5f636e4086ee343b0 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Sun, 11 Feb 2024 18:38:56 +0000 Subject: [PATCH 024/159] Add custom matvec kernels and sampler matmul call tuned_gemm --- csrc/custom/custom.cpp | 74 +++++ csrc/custom/custom_kernels.cu | 367 +++++++++++++++++++++++ csrc/custom/fused_kernels.cu | 192 ++++++++++++ setup.py | 6 + vllm/model_executor/layers/sampler.py | 5 +- vllm/model_executor/layers/tuned_gemm.py | 20 +- 6 files changed, 657 insertions(+), 7 deletions(-) create mode 100644 csrc/custom/custom.cpp create mode 100644 csrc/custom/custom_kernels.cu create mode 100644 csrc/custom/fused_kernels.cu diff --git a/csrc/custom/custom.cpp b/csrc/custom/custom.cpp new file mode 100644 index 0000000000000..aeff9cc5e6ae7 --- /dev/null +++ b/csrc/custom/custom.cpp @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +namespace py = pybind11; + +// declare templates for front (cpp) and back (cuda) sides of function: +//template + +void LLGemm_Silu(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block); +void LLMM_Silu(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block) { + int M = in_a.size(0); + int K = in_a.size(1); + LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(), + out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block); +} + +void LLGemm1(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream,const int rows_per_block); + +//template +void LLMM1(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int rows_per_block=4) { + int M = in_a.size(0); + int K = in_a.size(1); + //if (N != in_b.numel()) + // throw std::invalid_argument("Size mismatch A.numel(): " + std::to_string(in_a.numel()) + // + ", B.numel(): " + std::to_string(in_b.numel())); + + //out_c.resize_({N}); + + // call the kernel function... 
+ LLGemm1(in_a.data_ptr(), in_b.data_ptr(), + out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),rows_per_block); +} + +void LLGemmZZ(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int solidx); + +void LLZZ(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c, const int solidx=0) { + int M = in_a.size(0); + int K = in_a.size(1); + + LLGemmZZ(in_a.data_ptr(), in_b.data_ptr(), + out_c.data_ptr(), M, K, at::cuda::getCurrentCUDAStream(),solidx); +} +// instantiate the CPP template for T=float: +//template void AddGPU(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c); + + +void MMGPUKernel(float *in_a, float *in_b, float *out_c, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns, + cudaStream_t stream); + + +void MMCustomGPU(at::Tensor in_a, at::Tensor in_b, at::Tensor out_c) { + auto matA_sizes { in_a.sizes() }; + auto matB_sizes { in_b.sizes() }; + auto matO_sizes { out_c.sizes() }; + MMGPUKernel(in_a.data_ptr(), in_b.data_ptr(), out_c.data_ptr(), + matA_sizes[0], matA_sizes[1], + matB_sizes[0], matB_sizes[1], + matO_sizes[0], matO_sizes[1], + at::cuda::getCurrentCUDAStream()); +} + +// declare the extension module with the AddGPU function: +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ + m.doc() = "pybind11 example plugin"; + m.def("LLMM1", &LLMM1); + m.def("LLMM_Silu", &LLMM_Silu); + m.def("LLZZ", &LLZZ); +//m.def("MMCustomGPU", &MMCustomGPU); +} diff --git a/csrc/custom/custom_kernels.cu b/csrc/custom/custom_kernels.cu new file mode 100644 index 0000000000000..b5ab0dbe8317c --- /dev/null +++ b/csrc/custom/custom_kernels.cu @@ -0,0 +1,367 @@ +#include +#include +#include +#include + +constexpr int WARP_SIZE = 64; + +template +__device__ __forceinline__ T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 
= loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + //auto dat0 = *(addr_alias); + //auto dat1 = *(addr_alias+1); + //auto dat2 = *(addr_alias+2); + //auto dat3 = *(addr_alias+3); + return make_float4(dat0,dat1,dat2,dat3); +} + +//TBlock fetches entire rows of A, and entire col of B (K dimension); assume N=1 for time being +//grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm1_kernel(float4 *af4, __half2 *bf4, __half2 *c) { + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK * blockDim.x; + //int row_addr_1 = row_addr + CUDA_NUM_THREADS; + //int row_addr_2 = row_addr_1 + CUDA_NUM_THREADS; + //int row_addr_3 = row_addr_2 + CUDA_NUM_THREADS; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid/16; + const int qthreadid = threadid%16; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + //float4 colB_elem4; + __half2 colB_elem4x,colB_elem4y,colB_elem4z,colB_elem4w; + float4 sum4; //[NUM_A_ROWS_PER_BLOCK]; + float acc[NUM_A_ROWS_PER_BLOCK]; //= 0.0; + __half2 acch2; + __half2 oval; + + //rowA_elem4 = af4[row_addr + threadid]; + //__syncthreads(); + //rowA_elem4_1 = af4[row_addr_1 + threadid]; + //rowA_elem4_2 = af4[row_addr_2 + threadid]; + //rowA_elem4_3 = af4[row_addr_3 + threadid]; + #pragma unroll + for (int i=0; i(&colB_elem4); + //auto Bf2x = *Bh2ptr; + //auto Bf2y = *(Bh2ptr+1); + //auto Bf2z = *(Bh2ptr+2); + //auto Bf2w = *(Bh2ptr+3); + auto Ah2ptr = reinterpret_cast<__half2 *>(&rowA_elem4); + __half2 *ah2lptr; + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + //#pragma unroll + //for (int i=0; i8) { + // #pragma unroll + // for (int j=0; j<8; j++) { + // acc[2*threadid] += red_smem[2*threadid][j]; + // acc[2*threadid+1] += 
red_smem[2*threadid+1][j]; + // } + // } + // #pragma unroll + // for (int j=0; j +void LLGemm1(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block=4) { + float4 *af4 = reinterpret_cast(in_a); + auto *bf4 = reinterpret_cast<__half2*>(in_b); + auto *c = reinterpret_cast<__half2*>(out_c); + //constexpr int A_ROWS_PER_BLOCK = 8; + const int NUM_THREADS = K*2/16; + int NUM_BLOCKS = M/rows_per_block; + if (rows_per_block==2) { + LLGemm1_kernel<2><<>>(af4, bf4, c); + } + else if (rows_per_block==4) { + LLGemm1_kernel<4><<>>(af4, bf4, c); + } + else if (rows_per_block==8) { + LLGemm1_kernel<8><<>>(af4, bf4, c); + } + else if (rows_per_block==16) { + LLGemm1_kernel<16><<>>(af4, bf4, c); + } + else { + NUM_BLOCKS = M/4; + LLGemm1_kernel<4><<>>(af4, bf4, c); + } + + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} + +// instantiate the kernel template for T=float: +//template void AddGPUKernel(float *in_a, float *in_b, float *out_c, const int M, const int K, cudaStream_t stream); + +const unsigned int TILE_WIDTH = 32; + +// Compute C = A * B +__global__ void matrixMultiplyShared(float *A, float *B, float *C, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns) { + __shared__ float sA[TILE_WIDTH][TILE_WIDTH]; // Tile size of 32x32 + __shared__ float sB[TILE_WIDTH][TILE_WIDTH]; + + int Row = blockDim.y * blockIdx.y + threadIdx.y; + int Col = blockDim.x * blockIdx.x + threadIdx.x; + float Cvalue = 0.0; + sA[threadIdx.y][threadIdx.x] = 0.0; + sB[threadIdx.y][threadIdx.x] = 0.0; + + for (int ph = 0; ph < (((numAColumns - 1) / TILE_WIDTH) + 1); ph++) { + if ((Row < numARows) && (threadIdx.x + (ph * TILE_WIDTH)) < numAColumns) { + sA[threadIdx.y][threadIdx.x] = A[(Row * numAColumns) + threadIdx.x + (ph * TILE_WIDTH)]; + } else { + sA[threadIdx.y][threadIdx.x] = 0.0; + } + if (Col < 
numBColumns && (threadIdx.y + ph * TILE_WIDTH) < numBRows) { + sB[threadIdx.y][threadIdx.x] = B[(threadIdx.y + ph * TILE_WIDTH) * numBColumns + Col]; + } else { + sB[threadIdx.y][threadIdx.x] = 0.0; + } + __syncthreads(); + for (int j = 0; j < TILE_WIDTH; ++j) { + Cvalue += sA[threadIdx.y][j] * sB[j][threadIdx.x]; + } + } + if (Row < numCRows && Col < numCColumns) { + C[Row * numCColumns + Col] = Cvalue; + } +} + + +void MMGPUKernel(float *in_a, float *in_b, float *out_c, + int numARows, int numAColumns, + int numBRows, int numBColumns, + int numCRows, int numCColumns, + cudaStream_t stream) { + + // Initialize the grid and block dimensions + dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1); + dim3 dimGrid((numCColumns / TILE_WIDTH) + 1, (numCRows / TILE_WIDTH) + 1, 1); + //@@ Launch the GPU Kernel here + matrixMultiplyShared <<>> + (in_a, in_b, out_c, numARows, numAColumns, numBRows, numBColumns, numCRows, numCColumns); + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} + + + +template +__global__ +__launch_bounds__(512) +void HGEMV_WFPerRow(int m, int n, const _Float16 *A, int lda, const _Float16 *x, _Float16 *y) +{ + int num_row_per_block = CTA / nThreads_per_row; + int row_id = (blockIdx.x*num_row_per_block+threadIdx.y)*MT0; + int inc = (gridDim.x * num_row_per_block)*MT0; + + while (row_id < m) { + float2 sum2[MT0]; + +#pragma unroll + for (int i = 0; i < MT0; ++i) + { + sum2[i] = {0.0,0.0}; + } + + for (int j = threadIdx.x; j < n; j += (nThreads_per_row*MT1)){ + bool is_active = j < n; + if (is_active) { + float2 x2[MT1>>1]; +#pragma unroll + for(int offset = 0; offset < MT1; offset += 2) + { + x2[offset>>1] = {x[j+nThreads_per_row*offset], x[j+nThreads_per_row*(offset+1)]}; + } + float2 a2[MT0][MT1>>1]; +#pragma unroll + for (int i = 0; i < MT0; i++) + { +#pragma unroll + for (int offset = 0; offset < MT1; offset += 2) + { + a2[i][offset>>1] = 
{A[(row_id+i)*n+j+nThreads_per_row*offset], A[(row_id+i)*n+j+nThreads_per_row*(offset+1)]}; + } + } + +#pragma unroll + for (int i = 0; i < MT0; i++) + { +#pragma unroll + for (int offset = 0; offset < (MT1>>1); offset++) + { + sum2[i] += a2[i][offset]*x2[offset]; + } + } + + } + } + float sum[MT0]; +#pragma unroll + for (int i = 0; i < MT0; i++) + { + sum[i] = sum2[i].x+sum2[i].y; + } + +#pragma unroll + for (int i = 0; i < MT0; i++) + { +#pragma unroll + for (int offset = nThreads_per_row >> 1; offset >= 1; offset = offset >> 1) { + sum[i] += __shfl_down(sum[i], offset, nThreads_per_row); + } + } + if (threadIdx.x == 0) + { +#pragma unroll + for (int i = 0; i < MT0; i++) + { + y[row_id+i] = sum[i]; + } + } + row_id += inc; + } +} + +void LLGemmZZ(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int solidx=0) { + //m -> M, n-> K + dim3 grid(1024); + dim3 block(64, 8); + if (solidx==0) { + HGEMV_WFPerRow<64, 512, 4, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + else if (solidx==1) { + HGEMV_WFPerRow<64, 512, 2, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + else if (solidx==2) { + HGEMV_WFPerRow<64, 512, 1, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + else { + HGEMV_WFPerRow<64, 512, 4, 8><<>>(M, K, reinterpret_cast(in_a), K, + reinterpret_cast(in_b),reinterpret_cast<_Float16*>(out_c)); + } + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} diff --git a/csrc/custom/fused_kernels.cu b/csrc/custom/fused_kernels.cu new file mode 100644 index 0000000000000..5a4a11f914eb9 --- /dev/null +++ b/csrc/custom/fused_kernels.cu @@ -0,0 +1,192 @@ +#include +#include +#include +#include + +constexpr int WARP_SIZE = 64; + +template +__device__ __forceinline__ T 
silu(const T& x) { + // x * sigmoid(x) + return (T) (((float) x) / (1.0f + expf((float) -x))); +} + +template +__device__ __forceinline__ T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + //auto dat0 = *(addr_alias); + //auto dat1 = *(addr_alias+1); + //auto dat2 = *(addr_alias+2); + //auto dat3 = *(addr_alias+3); + return make_float4(dat0,dat1,dat2,dat3); +} + +//TBlock fetches entire rows of A, and entire col of B (K dimension); assume N=1 for time being +//grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm_Silu_kernel(float4 *af4, __half2 *bf4, _Float16 *c, const int d) { + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK/2 * blockDim.x; + const int row_addr_d = row_addr + d * blockDim.x; + //int row_addr_1 = row_addr + CUDA_NUM_THREADS; + //int row_addr_2 = row_addr_1 + CUDA_NUM_THREADS; + //int row_addr_3 = row_addr_2 + CUDA_NUM_THREADS; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid/16; + const int qthreadid = threadid%16; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + //float4 colB_elem4; + __half2 colB_elem4x,colB_elem4y,colB_elem4z,colB_elem4w; + float4 sum4; //[NUM_A_ROWS_PER_BLOCK]; + float acc[NUM_A_ROWS_PER_BLOCK]; //= 0.0; + __half2 acch2; + __half2 oval; + + //rowA_elem4 = af4[row_addr + threadid]; + //__syncthreads(); + //rowA_elem4_1 = af4[row_addr_1 + threadid]; + //rowA_elem4_2 = af4[row_addr_2 + threadid]; + //rowA_elem4_3 = af4[row_addr_3 + threadid]; + #pragma unroll + for (int i=0; i(&colB_elem4); + //auto Bf2x = *Bh2ptr; + //auto Bf2y = 
*(Bh2ptr+1); + //auto Bf2z = *(Bh2ptr+2); + //auto Bf2w = *(Bh2ptr+3); + auto Ah2ptr = reinterpret_cast<__half2 *>(&rowA_elem4); + __half2 *ah2lptr; + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + #pragma unroll + for (int i=0; i= 1; mask /= 2) { + //#pragma unroll + //for (int i=0; i +void LLGemm_Silu(void *in_a, void *in_b, void *out_c, const int M, const int K, cudaStream_t stream, const int rows_per_block=4) { + float4 *af4 = reinterpret_cast(in_a); + auto *bf4 = reinterpret_cast<__half2*>(in_b); + auto *c = reinterpret_cast<_Float16*>(out_c); + const int d = M/2; + const int NUM_THREADS = K*2/16; + int NUM_BLOCKS = M/rows_per_block; + if (rows_per_block==2) { + LLGemm_Silu_kernel<2><<>>(af4, bf4, c, d); + } + else if (rows_per_block==4) { + LLGemm_Silu_kernel<4><<>>(af4, bf4, c, d); + } + else if (rows_per_block==8) { + LLGemm_Silu_kernel<8><<>>(af4, bf4, c, d); + } + else if (rows_per_block==16) { + LLGemm_Silu_kernel<16><<>>(af4, bf4, c, d); + } + else { + NUM_BLOCKS = M/4; + LLGemm_Silu_kernel<4><<>>(af4, bf4, c, d); + } + + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} + diff --git a/setup.py b/setup.py index 15b9a78f6ca27..15715225490af 100644 --- a/setup.py +++ b/setup.py @@ -348,6 +348,12 @@ def get_torch_arch_list() -> Set[str]: ) ext_modules.append(vllm_extension) +custom_extension = CUDAExtension( + name="vllm.custom_ops", + sources=["csrc/custom/custom.cpp", "csrc/custom/custom_kernels.cu", "csrc/custom/fused_kernels.cu"], + extra_compile_args={"cxx": CXX_FLAGS, "nvcc": NVCC_FLAGS}, +) +ext_modules.append(custom_extension) def get_path(*filepath) -> str: return os.path.join(ROOT_DIR, *filepath) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index bc86a916b5bbf..d7e56850f076a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -11,6 +11,7 @@ from vllm.sequence 
import (PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, SequenceOutput) +from vllm.model_executor.layers.tuned_gemm import tgemm class Sampler(nn.Module): """Samples the next tokens from the model's outputs. @@ -38,7 +39,8 @@ def __init__(self, def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, embedding_bias: Optional[torch.Tensor]) -> torch.Tensor: # Get the logits for the next tokens. - logits = torch.matmul(hidden_states, embedding.t()) + #logits = torch.matmul(hidden_states, embedding.t()) + logits = tgemm.mm(hidden_states, embedding) if embedding_bias is not None: logits += embedding_bias logits = tensor_model_parallel_gather(logits) @@ -59,7 +61,6 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - # Only perform sampling in the driver worker. # Note: `_get_logits` is still distributed across TP workers because # the `embedding` weight is distributed across TP workers. diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 556a972bc3e34..84994d7f9daeb 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -5,7 +5,7 @@ import os import yaml import pandas as pd -#from vllm import custom_ops +from vllm import custom_ops class TunedGemm: @@ -29,7 +29,7 @@ def load_best_sols(self): def apply_custom(self,ds): M,N,K = ds['M'],ds['N'],ds['K'] #apply custom matvec (only for f16 dtype) - return ds + #return ds if N==1: ds1 = ds.copy() ds1['libtype'] = 'custom' @@ -58,7 +58,14 @@ def create_ds(self): def query_sol(self,m,n,k): return self.solids.get((m,n,k),(0,0)) def mm(self,inp,weights): - inp_view=inp.view(-1,inp.size(-1)) + # F.Linear can take a 3 dimensional input. vllm uses this for linear units. 
+ # However, sampler will use torch.matmul with 2 dimensions only + if inp.dim() == 3: + inp_view=inp.view(-1,inp.size(-1)) + batched = True + else: + inp_view = inp + batched = False #print(f'>>>inp_view {inp_view.shape}') if self.extensions_created == False: rocb_create_extension() @@ -86,8 +93,11 @@ def mm(self,inp,weights): #print(">>> found rocblas") out = rocb_mm(inp_view,weights.t(),solidx) else: - #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) out = F.linear(inp,weights) - return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) + if batched: + return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) + else: + return out tgemm = TunedGemm() \ No newline at end of file From 0d59cfbb6ce1d19760cda992490abe8211b638cd Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Mon, 12 Feb 2024 22:31:09 +0000 Subject: [PATCH 025/159] Initial skeleton; scaling factors in CacheEngine and PagedAttention --- 3rdparty/quantizer/extract_scales.py | 82 +++++++++++++++++++++ vllm/config.py | 7 ++ vllm/engine/arg_utils.py | 11 +++ vllm/engine/llm_engine.py | 1 + vllm/model_executor/layers/attention.py | 8 +- vllm/model_executor/models/aquila.py | 7 +- vllm/model_executor/models/baichuan.py | 7 +- vllm/model_executor/models/bloom.py | 7 +- vllm/model_executor/models/chatglm.py | 5 +- vllm/model_executor/models/deepseek.py | 7 +- vllm/model_executor/models/falcon.py | 7 +- vllm/model_executor/models/gpt2.py | 5 +- vllm/model_executor/models/gpt_bigcode.py | 5 +- vllm/model_executor/models/gpt_j.py | 7 +- vllm/model_executor/models/gpt_neox.py | 7 +- vllm/model_executor/models/internlm.py | 7 +- vllm/model_executor/models/internlm2.py | 7 +- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/models/mistral.py | 7 +- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mixtral_quant.py | 7 +- vllm/model_executor/models/mpt.py | 7 +- 
vllm/model_executor/models/opt.py | 6 +- vllm/model_executor/models/phi.py | 7 +- vllm/model_executor/models/qwen.py | 7 +- vllm/model_executor/models/qwen2.py | 7 +- vllm/model_executor/models/stablelm.py | 7 +- vllm/model_executor/models/yi.py | 7 +- vllm/model_executor/weight_utils.py | 16 ++++ vllm/worker/cache_engine.py | 54 ++++++++++++-- vllm/worker/model_runner.py | 8 +- 31 files changed, 262 insertions(+), 79 deletions(-) create mode 100644 3rdparty/quantizer/extract_scales.py diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py new file mode 100644 index 0000000000000..ace05e52a69b8 --- /dev/null +++ b/3rdparty/quantizer/extract_scales.py @@ -0,0 +1,82 @@ +import argparse +import json +import os +from vllm.model_executor.weight_utils import ( + hf_model_weights_iterator, + prepare_hf_model_weights +) + +default_output_name = "kv_cache_scales.json" + + +def main(args): + layer_scale_factors_map = {} + if args.output is None: + hf_folder, _, _ = prepare_hf_model_weights(args.model, + args.cache_dir, + args.load_format, + revision=args.revision) + output_file = os.path.join(hf_folder, default_output_name) + else: + output_file = os.path.join(args.output, default_output_name) + if not os.path.isdir(args.output): + os.makedirs(args.output, exist_ok=True) + + for name, param in hf_model_weights_iterator(args.model, + args.cache_dir, + args.load_format, + args.revision): + if "kv_cache_scaling_factor" in name: + nums = [int(s) for s in name.split('.') if s.isdigit()] + assert len(nums) == 1, f"Could not determine layer idx for {name}!" + layer_idx = nums[0] + assert layer_idx not in layer_scale_factors_map, f"Duplicate scaling " \ + f"factor corresponding to layer {layer_idx}!" + try: + layer_scale_factors_map[layer_idx] = param.item() + except RuntimeError: + print("This utility supports only per-tensor scalar scale factors " + f"for now. 
The tensor\n {name} = {param} is an invalid " + "scale factor!") + raise + if len(layer_scale_factors_map) == 0: + print("WARNING: No KV cache scale factors found! No output saved.") + else: + with open(output_file, 'w') as f: + json.dump(layer_scale_factors_map, f, sort_keys=True) + print(f"Completed! KV cache scaling factors saved to {output_file}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="This simple utility extracts the " + "KV cache scaling factors from a quantized HF model " + "and saves them to a JSON file compatible with later " + "use by vLLM (pass this file to the appropriate " + "runtime typically using the argument " + "--kv-cache-scales ). This is only used " + "if the KV cache dtype is FP8.") + parser.add_argument("--model", + help="Specify either a directory or name of a HF model. If the model " + "does not exist, this utility will attempt to download said model " + "from the HF repo.", + required=True) + parser.add_argument("--cache_dir", + help="Optionally specify a cache directory to use for a HF model " + "download.", + default=None) + parser.add_argument("--load_format", + help="Optionally specify the format of the model's tensor files " + "containing the KV cache scaling factors.", + choices=["auto", "safetensors", "pt", "npcache"], + default="auto") + parser.add_argument("--revision", + help="Optionally specify the model's revision number.", + default=None) + parser.add_argument("--output", + help="Specify the output directory. 
By default it will be saved in " + f"the model directory with the filename {default_output_name}, " + "however you can override this behavior here.", + default=None) + args = parser.parse_args() + + main(args) diff --git a/vllm/config.py b/vllm/config.py index d4cb6402c7269..b80385f23ef43 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -285,6 +285,7 @@ def __init__( gpu_memory_utilization: float, swap_space: int, cache_dtype: str, + kv_cache_scales: Optional[str] = None, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size @@ -292,6 +293,7 @@ def __init__( self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window + self.kv_cache_scales = kv_cache_scales self._verify_args() self._verify_cache_dtype() @@ -309,6 +311,11 @@ def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass elif self.cache_dtype == "fp8": + if self.kv_cache_scales is None: + logger.warn(f"Using cache dtype {self.cache_dtype} but no " + "scaling factors provided. Defaulting to 1.0 " + "scales, be warned that this might lead to " + "inaccurate results!") if not is_hip(): nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version < Version("11.8"): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f4c6994c1f69e..f402396b333fc 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,6 +18,7 @@ class EngineArgs: load_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' + kv_cache_scales: str = None seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False @@ -131,6 +132,15 @@ def add_cli_args( help='Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + parser.add_argument( + '--kv-cache-scales', + type=str, + default=None, + help='Path to the JSON file containing the KV cache scaling factors. ' + 'This should generally be supplied when KV cache dtype is FP8. Otherwise ' + 'the KV cache scaling factors default to 1.0, which will likely cause ' + 'accuracy issues. Note FP8 is not supported when cuda version is ' + 'lower than 11.8.') parser.add_argument('--max-model-len', type=int, default=None, @@ -279,6 +289,7 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, + self.kv_cache_scales, model_config.get_sliding_window()) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e60efc5e54e16..f0d5724071a99 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -85,6 +85,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " + f"kv_cache_scales={cache_config.kv_cache_scales}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 91ed43f07c76e..2db2b517d52e1 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -65,6 +65,7 @@ def forward( value: torch.Tensor, key_cache: Optional[torch.Tensor], value_cache: Optional[torch.Tensor], + kv_cache_scaling_factor: Optional[torch.Tensor], input_metadata: InputMetadata, ) -> torch.Tensor: """PagedAttention forward pass. 
@@ -86,12 +87,17 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - + # Reshape the keys and values and store them in the cache. # If key_cache and value_cache are not provided, the new key and value # vectors will not be cached. This happens during the initial memory # profiling run. if key_cache is not None and value_cache is not None: + if kv_cache_scaling_factor is not None: + # Scale the key and value scaling factors for quantization + # by cache ops + key = key.div_(kv_cache_scaling_factor) + value = value.div_(kv_cache_scaling_factor) cache_ops.reshape_and_cache( key, value, diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py index 2f2bd5ffb4a63..4cf15dd7585b5 100644 --- a/vllm/model_executor/models/aquila.py +++ b/vllm/model_executor/models/aquila.py @@ -45,7 +45,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.aquila import AquilaConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class AquilaMLP(nn.Module): @@ -162,8 +162,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f08c3c8d257ff..83bd391b108eb 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -44,7 +44,7 @@ from vllm.sequence import SamplerOutput from 
vllm.transformers_utils.configs.baichuan import BaiChuanConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -177,8 +177,9 @@ def forward( q, k, v = qkv.chunk(chunks=3, dim=-1) if self.postion_embedding != "ALIBI": q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4adfb6b78102f..f9954849bc081 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -122,8 +122,9 @@ def forward( del position_ids # Unused. 
qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index dca8d724f976b..3fc7cafaa006f 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -28,7 +28,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GLMAttention(nn.Module): @@ -104,13 +104,14 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(position_ids, q, k) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache context_layer = self.attn( q, k, v, key_cache, value_cache, + kv_cache_scaling_factor, input_metadata, ) attn_output, _ = self.dense(context_layer) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index fc727b8e661b3..683168679bfa4 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class DeepseekMLP(nn.Module): @@ -253,8 +253,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, 
k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 2b5e022312e3b..e7e8271e01c5b 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -47,7 +47,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import RWConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] FalconConfig = Union[HF_FalconConfig, RWConfig] @@ -185,8 +185,9 @@ def forward( q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.use_rotary: q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) attn_output, bias = self.dense(attn_output) return attn_output, bias diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 661da0fe0434e..6fcb13f177113 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPT2Attention(nn.Module): @@ -85,8 +85,9 @@ def forward( ) -> torch.Tensor: qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, + kv_cache_scaling_factor, input_metadata) attn_output, _ = 
self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ef4c1d4143c88..79993d938f571 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPTBigCodeAttention(nn.Module): @@ -104,8 +104,9 @@ def forward( ], dim=-1, ) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, + kv_cache_scaling_factor, input_metadata) attn_output, _ = self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 5bab30d9d442e..642a53562662a 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPTJAttention(nn.Module): @@ -98,8 +98,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) attn_output, _ = self.out_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8f7e1063e0c1d..1ffb4aa6fc8df 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) 
from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPTNeoXAttention(nn.Module): @@ -99,8 +99,9 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index 5d0b93793c89d..0ba01243e4eac 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -24,7 +24,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class InternLMMLP(nn.Module): @@ -114,8 +114,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ebf1d8a89a022..ac19238992d4d 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -24,7 +24,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class 
InternLM2MLP(nn.Module): @@ -129,8 +129,9 @@ def forward( qkv, _ = self.wqkv(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.wo(attn_output) return output diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e5a1abebf1420..acdd30d3d75b4 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,7 +47,7 @@ from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class LlamaMLP(nn.Module): @@ -152,8 +152,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py index 01cde67844122..29dfe0e0e0903 100644 --- a/vllm/model_executor/models/mistral.py +++ b/vllm/model_executor/models/mistral.py @@ -47,7 +47,7 @@ from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class MistralMLP(nn.Module): @@ -150,8 +150,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, 
self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a8e470395b904..fd6e260177f7e 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class MixtralMoE(nn.Module): @@ -222,8 +222,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index a8dadce24aa1d..b72bfe869be14 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class MixtralMLP(nn.Module): @@ -232,8 +232,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - 
attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 22a876e2ef691..28bdc3301c67f 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -24,7 +24,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.mpt import MPTConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] def _get_alibi_slopes( @@ -126,8 +126,9 @@ def forward( if self.qk_ln: q = self.q_ln(q) k = self.k_ln(k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 393b2dcabcd5a..0caa424fb2a36 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class OPTLearnedPositionalEmbedding(nn.Embedding): @@ -101,9 +101,9 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, - input_metadata) + kv_cache_scaling_factor, input_metadata) output, _ = self.out_proj(attn_output) return output diff --git 
a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index d143261968288..b6591f51958cb 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -59,7 +59,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class PhiAttention(nn.Module): @@ -120,8 +120,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index fbc7320fb45a4..b370125b64796 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -29,7 +29,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.qwen import QWenConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class QWenMLP(nn.Module): @@ -116,8 +116,9 @@ def forward( qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.c_proj(attn_output) return output diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e823e6f8c3dbe..9bbfd4864e743 100644 --- a/vllm/model_executor/models/qwen2.py +++ 
b/vllm/model_executor/models/qwen2.py @@ -47,7 +47,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class Qwen2MLP(nn.Module): @@ -151,8 +151,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 95e5ad8ede63e..8112c610f23f8 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class StablelmMLP(nn.Module): @@ -136,8 +136,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py index 53daa6c4cd939..1dfe566097ed0 100644 --- a/vllm/model_executor/models/yi.py +++ b/vllm/model_executor/models/yi.py @@ -46,7 +46,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = 
Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class YiMLP(nn.Module): @@ -150,8 +150,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 8e6f7a174f219..0583fd3d930ef 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -262,6 +262,22 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() +def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor]]: + """ + A simple utility to read in KV cache scaling factors that have been + previously serialized to disk. Used by the CacheEngine to populate its + caches with the appropriate scaling factors. The first object of the pair + is the cache (and model) layer corresponding to the scaling factor, and the + second is the scaling factor itself. 
Keep this function in sync with the output + of 3rdparty/quantization/extract_scales.py and with the scaling factor structure + assumed to hold in worker/cache_engine.py + """ + with open(filename) as f: + layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) + for layer_idx, scale_factor in layer_scale_factor_map.items(): + yield int(layer_idx), float(scale_factor) + + def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: """convert PySafeSlice object from safetensors to torch.Tensor diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index f57e1ed75803d..9b177c8ef3671 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,16 +1,17 @@ """CacheEngine class for managing the KV cache.""" -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import torch from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger +from vllm.model_executor.weight_utils import kv_cache_scales_iterator from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class CacheEngine: @@ -43,11 +44,20 @@ def __init__( self.dtype = model_config.dtype else: self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + + # We enable cache scaling factors if and only if cache is FP8-typed + self.use_scaling_factor = torch.tensor([], dtype=self.dtype).element_size() == 1 # Initialize the cache. self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() + # Load scaling factors into the GPU cache if values are specified + # We do not need to load them into the CPU cache because they are + # never swapped out. 
+ if self.cache_config.kv_cache_scales is not None: + self.load_kv_cache_scales(self.cache_config.kv_cache_scales) + # Initialize the stream for caching operations. self.cache_stream = torch.cuda.Stream() assert self.cache_stream != torch.cuda.current_stream() @@ -86,7 +96,15 @@ def allocate_gpu_cache(self) -> List[KVCache]: dtype=self.dtype, device="cuda", ) - gpu_cache.append((key_blocks, value_blocks)) + if self.use_scaling_factor: + scaling_factor = torch.ones( + 1, + dtype=torch.float32, + device='cuda', + ) + else: + scaling_factor = None + gpu_cache.append((key_blocks, value_blocks, scaling_factor)) return gpu_cache def allocate_cpu_cache(self) -> List[KVCache]: @@ -110,7 +128,9 @@ def allocate_cpu_cache(self) -> List[KVCache]: dtype=self.dtype, pin_memory=pin_memory, ) - cpu_cache.append((key_blocks, value_blocks)) + # Scale factors are not involved in the swap process and never need to reside on CPU + scaling_factor = None + cpu_cache.append((key_blocks, value_blocks, scaling_factor)) return cpu_cache def _swap( @@ -121,8 +141,13 @@ def _swap( ) -> None: with torch.cuda.stream(self.cache_stream): for i in range(self.num_layers): - src_key_cache, src_value_cache = src[i] - dst_key_cache, dst_value_cache = dst[i] + src_key_cache, src_value_cache, src_scaling = src[i] + dst_key_cache, dst_value_cache, dst_scaling = dst[i] + # We should not need to copy scaling factors, as they are equal + # given a fixed layer + # TODO(mattwong) Remove this once confirmed + if self.use_scaling_factor: + assert torch.equal(src_scaling, dst_scaling) # Copy the key blocks. cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) # Copy the value blocks. 
@@ -138,10 +163,18 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: - key_caches = [key_cache for key_cache, _ in self.gpu_cache] - value_caches = [value_cache for _, value_cache in self.gpu_cache] + key_caches = [key_cache for key_cache, _, _ in self.gpu_cache] + value_caches = [value_cache for _, value_cache, _ in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) + + # Helper function to load in static KV cache scaling factors (one per layer) + # stored in a given file. These scaling factors are assumed to not take up + # too much space and are hence permanently resident on GPU. + def load_kv_cache_scales(self, filename: str): + for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): + self.gpu_cache[layer_idx][2].copy_(scaling_factor) + @staticmethod def get_cache_block_size( @@ -162,6 +195,11 @@ def get_cache_block_size( else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] dtype_size = _get_dtype_size(dtype) + + use_scaling_factor = dtype_size == 1 + if use_scaling_factor: + return dtype_size * total + num_layers * 4 + return dtype_size * total diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2df9fd5215a2d..3d63ab8613a79 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -20,7 +20,7 @@ logger = init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] _PAD_SLOT_ID = -1 LORA_WARMUP_RANK = 8 # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. 
@@ -517,7 +517,7 @@ def prepare_input_tensors( def execute_model( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: List[KVCache], ) -> Optional[SamplerOutput]: input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping = ( self.prepare_input_tensors(seq_group_metadata_list)) @@ -595,7 +595,7 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [(None, None)] * num_layers + kv_caches = [(None, None, None)] * num_layers self.execute_model(seqs, kv_caches) torch.cuda.synchronize() return @@ -753,7 +753,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: # KV caches are fixed tensors, so we don't need to copy them. From af9e9d17575b8160c8bf352697ea9efd936bb45c Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Wed, 14 Feb 2024 16:44:33 +0000 Subject: [PATCH 026/159] Add silu gemm fusion when batch and seq_len = 1 --- vllm/model_executor/layers/linear.py | 2 -- vllm/model_executor/layers/tuned_gemm.py | 5 ++--- vllm/model_executor/models/llama.py | 13 +++++++++++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 63ea7b856c1ee..e88ec167b5e3e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -68,8 +68,6 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: - #print(f">>> HELOOOOOOOOOOOOOO apply_weights {x.shape}, {weight.shape}, {bias}") - if bias: return F.linear(x, weight) + bias return F.linear(x, weight) diff --git a/vllm/model_executor/layers/tuned_gemm.py 
b/vllm/model_executor/layers/tuned_gemm.py index 84994d7f9daeb..0cc7b866dd219 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -29,7 +29,6 @@ def load_best_sols(self): def apply_custom(self,ds): M,N,K = ds['M'],ds['N'],ds['K'] #apply custom matvec (only for f16 dtype) - #return ds if N==1: ds1 = ds.copy() ds1['libtype'] = 'custom' @@ -93,11 +92,11 @@ def mm(self,inp,weights): #print(">>> found rocblas") out = rocb_mm(inp_view,weights.t(),solidx) else: - print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) out = F.linear(inp,weights) if batched: return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) else: return out -tgemm = TunedGemm() \ No newline at end of file +tgemm = TunedGemm() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e5a1abebf1420..52752e64e5d48 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -46,6 +46,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig +from vllm import custom_ops KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -74,8 +75,16 @@ def __init__( self.act_fn = SiluAndMul() def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) + #print(f'>>>Shape of x in mlp {x.shape} {self.gate_up_proj.weight.shape}') + if x.shape[0] == 1 and x.shape[1] == 1: + + out = torch.empty(x.shape[0],self.gate_up_proj.weight.shape[0]//2,dtype=x.dtype,device=x.device) + custom_ops.LLMM_Silu(self.gate_up_proj.weight,x.view(-1,x.size(-1)),out,8) + x = out.view(x.shape[0], x.shape[1], out.shape[1]) + else: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + #print(f'>>> x.shape {x.shape}') x, _ = self.down_proj(x) return x From 5f8eac35085a16d15b903009a5a381920eaa03a3 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Wed, 14 Feb 2024 17:52:42 
+0000 Subject: [PATCH 027/159] Add tunable flags to VLLM --- run_70b.sh | 109 +++++++++++++++++++++++ run_70b_fast.sh | 69 ++++++++++++++ vllm/model_executor/layers/tuned_gemm.py | 20 +++-- 3 files changed, 190 insertions(+), 8 deletions(-) create mode 100644 run_70b.sh create mode 100644 run_70b_fast.sh diff --git a/run_70b.sh b/run_70b.sh new file mode 100644 index 0000000000000..46e342826b2a7 --- /dev/null +++ b/run_70b.sh @@ -0,0 +1,109 @@ +#!/bin/bash +BASE_DIR=/workspace +VLLM_DIR=$BASE_DIR/vllm-private +GRAD_DIR=$BASE_DIR/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` +#MODEL=/data/llama-2-13b-chat-hf +GEMM_TUNER=1 +#TP="1 2 4 8" +TP=8 +#Flag to use Triton Flash Attention vs CK +export VLLM_USE_TRITON=1 + +#Gemm tuner flags +export VLLM_TUNE_GEMM=0 +export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" +export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" + +#Flag to use old torch.multinomial +#export VLLM_USE_TORCH_MULTINOMIAL=1 + +#Delete tuned gemms before running. +DELETE_TUNED_CSV=1 +#Flag to disable MSCCL +#export RCCL_MSCCL_ENABLE=0 +#HIPGraph performance flags +export HIP_FORCE_DEV_KERNARG=1 +export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 +#Enable full decoder graph mode +HIP_GRAPH=--use-cuda-graph +#Use top of tree build of RCCL +export LD_LIBRARY_PATH=/workspace/rccl/build/ +#Enable either flag to create a profile trace (rocprof, or rocpd) +#RPD_PROFILE="--profile" +#ROCPROF_PROFILE="rocprof --hip-trace" +GEN_LEN="1 32" +#INPUT_LEN="512 1024 2048 3072" +INPUT_LEN="512 1024 2048 3072 4096 6144 8192 16384" +ITER=10 +# pring usage of the parameters +usage() { + echo "Usage: $0 [--tp ] [--model ]" + exit 1 +} +# parse parameters +while [[ "$#" -gt 0 ]]; do + case $1 in + --tp) TP="$2"; shift ;; + --model) MODEL="$2"; shift ;; + --notune) GEMM_TUNER=0; shift ;; + *) usage ;; # Any other argument will show usage information. 
+ esac + shift # Move to next argument +done +for tp in $TP; +do + if (( $GEMM_TUNER )); + then + echo "tuned_gemm_csv: ./tuned_tp$tp.csv" > $VLLM_DIR/tuned_perf_tp$tp.yaml + tuned_file=$VLLM_DIR/tuned_tp$tp.csv + if [[ $DELETE_TUNED_CSV == 1 || ! -f $VLLM_DIR/tuned_tp$tp.csv ]]; + echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf.yaml + if [[ $DELETE_TUNED_CSV == 1 ]]; + then + rm -rf $tuned_file + echo "INFO: Generating Tuned Gemm configs" + cd $GRAD_DIR + python gemm_tuner.py --model_dir $MODEL --output $tuned_file --tp $tp + fi + export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml + echo "INFO: Generating Tuned Gemm configs" + cd $GRAD_DIR + python gemm_tuner.py --model_dir $MODEL --output $VLLM_TUNE_FILE --tp $tp + + + echo "================================= TUNED GEMMS $tuned_file ===============================================" + cat $tuned_file + + fi + + cd $VLLM_DIR + for gen_len in $GEN_LEN; + do + for input_len in $INPUT_LEN; + do + if [[ -v RPD_PROFILE ]] ; + then + rm /workspace/trace.rpd + python -m rocpd.schema --create /workspace/trace.rpd + fi + echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ + --tensor-parallel-size $tp --num-iters $ITER $HIP_GRAPH $RPD_PROFILE + if [[ -v ROCPROF_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + mv $VLLM_DIR/results.json $TRACE_FILE + fi + if [[ -v RPD_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE + fi + done + done +done \ No newline at end of file diff --git a/run_70b_fast.sh 
b/run_70b_fast.sh new file mode 100644 index 0000000000000..585e0ebdd000c --- /dev/null +++ b/run_70b_fast.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -e +BASE_DIR=/workspace +VLLM_DIR=$BASE_DIR/vllm-private +GRAD_DIR=/trees/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` + +export VLLM_TUNE_GEMM=0 +export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" +export VLLM_TUNE_FILE=$VLLM_DIR/"tuned.csv" + +#Flag to use Triton Flash Attention vs CK +export VLLM_USE_TRITON=1 + +#Flag to use old torch.multinomial +#export VLLM_USE_TORCH_MULTINOMIAL=1 + +#Delete tuned gemms before running. +#DELETE_TUNED_CSV=1 + +#Flag to disable MSCCL +#export RCCL_MSCCL_ENABLE=0 + +#HIPGraph performance flags +export HIP_FORCE_DEV_KERNARG=1 +export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 + +#Enable full decoder graph mode +HIP_GRAPH=--use-cuda-graph + +#Use top of tree build of RCCL +export LD_LIBRARY_PATH=/workspace/rccl/build/ + +#Enable either flag to create a profile trace (rocprof, or rocpd) +#RPD_PROFILE="--profile" +#ROCPROF_PROFILE="rocprof --hip-trace" + +#TP="1 2 4 8" +TP=8 +GEN_LEN="1,32" +INPUT_LEN="512 1024 2048 3072" +#INPUT_LEN="512,1024,2048,3072,4096,6144,8192,16384" +BATCH_SIZE="1" +ITER=10 + +rm -f $VLLM_UNTUNE_FILE +for tp in $TP; +do + cd $VLLM_DIR + export VLLM_TUNE_GEMM=1 + echo "================================= WARMING UP $MODEL ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ + --tensor-parallel-size $tp --num-iters 1 --warmup-only + + if [ -f $VLLM_UNTUNE_FILE ]; then + echo "=============================== Tuning ======================================" + python $GRAD_DIR/gemm_tuner.py --tuned_file $VLLM_TUNE_FILE --input_file $VLLM_UNTUNE_FILE + echo "File does not exist." 
+ fi + echo "================================= TUNED GEMMS $tuned_file ===============================================" + cat $VLLM_TUNE_FILE + + export VLLM_TUNE_GEMM=0 + echo "================================= RUNNING $MODEL ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ + --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv $HIP_GRAPH +done \ No newline at end of file diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 0cc7b866dd219..6d0a3ed3f2a60 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -2,6 +2,7 @@ import torch.nn.functional as F from rocsolidxgemm import rocb_create_extension,rocb_mm from hipbsolidxgemm import hipb_create_extension,hipb_mm +from pathlib import Path import os import yaml import pandas as pd @@ -16,16 +17,19 @@ def __init__(self): self.bestsols = {} self.load_best_sols() self.create_ds() + self.save_gemm = int(os.environ.get('VLLM_TUNE_GEMM',0)) + self.untune_path = os.environ.get('VLLM_UNTUNE_FILE', "/tmp/vllm_untuned.csv") + self.tune_path = os.environ.get('VLLM_TUNE_FILE', "tuned.csv") + + if (self.save_gemm == 1): + self.tuned_df = pd.DataFrame(columns=['M','N','K']) + else: + self.tuned_df = None + def load_best_sols(self): - perfbits = {} - perf_file = os.environ.get('VLLM_PERF_YAML') - if perf_file is not None: - with open(perf_file, 'r') as file: - perfbits = yaml.safe_load(file) + if self.tune_path is not None and Path(self.tune_path).is_file(): + self.bestsols = pd.read_csv(self.tune_path) - tune_file = perfbits.get('tuned_gemm_csv',None) - if tune_file is not None: - self.bestsols = pd.read_csv(tune_file,index_col=[0]) def apply_custom(self,ds): M,N,K = ds['M'],ds['N'],ds['K'] #apply custom 
matvec (only for f16 dtype) From 22766b48ecb18c037dc49004717d66a9027fcee2 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Wed, 14 Feb 2024 23:51:08 +0000 Subject: [PATCH 028/159] Allow benchmark_latency to take a list of input/output/batches for faster execution Also add reporting functionality for easy display --- benchmarks/benchmark_latency.py | 118 ++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d75d690cc66d4..f9b49ebfaa132 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -3,17 +3,23 @@ import time from pathlib import Path from typing import Optional - +import pandas as pd import numpy as np import torch from tqdm import tqdm from vllm import LLM, SamplingParams +from torch.profiler import profile, record_function, ProfilerActivity +def list_of_ints(arg): + return list(map(int, arg.split(','))) def main(args: argparse.Namespace): print(args) + print(f'>>>Loading LLM') + if args.report: + results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
llm = LLM( @@ -26,57 +32,62 @@ def main(args: argparse.Namespace): enforce_eager=args.enforce_eager, ) - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=args.output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size + for batch_size in args.batch_size: + for output_len in args.output_len: + for input_len in args.input_len: + print(f'>>>RUNNING {args.model} Batch_size:{batch_size} Input_len:{input_len} Output_len:{output_len}') + sampling_params = SamplingParams( + n=args.n, + temperature=0.0 if args.use_beam_search else 1.0, + top_p=1.0, + use_beam_search=args.use_beam_search, + ignore_eos=True, + max_tokens=output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = [[0] * input_len] * batch_size + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) + else: + start_time = time.perf_counter() + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) - 
else: - start_time = time.perf_counter() - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - return latency + print("Warming up...") + run_to_completion(profile_dir=None) - print("Warming up...") - run_to_completion(profile_dir=None) + if args.profile: + profile_dir = args.profile_result_dir + if not profile_dir: + profile_dir = Path( + "." + ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=args.profile_result_dir) + return - if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = Path( - "." - ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=args.profile_result_dir) - return + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + print(f'Avg latency: {np.mean(latencies)} seconds') - # Benchmark. 
- latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) - print(f'Avg latency: {np.mean(latencies)} seconds') if __name__ == '__main__': @@ -90,9 +101,9 @@ def run_to_completion(profile_dir: Optional[str] = None): choices=['awq', 'gptq', 'squeezellm', None], default=None) parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=int, default=32) - parser.add_argument('--output-len', type=int, default=128) - parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--input-len', type=list_of_ints, default=32) + parser.add_argument('--output-len', type=list_of_ints, default=128) + parser.add_argument('--batch-size', type=list_of_ints, default=8) parser.add_argument('--n', type=int, default=1, @@ -127,5 +138,10 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. 
Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument('--warmup-only', action='store_true', + help='only run warmup, useful for tuning') + parser.add_argument('--report', action='store_true', + help='turn on dataframe reporting') + parser.add_argument('--report-file', type=str, default=None) args = parser.parse_args() main(args) From 87b4c1bd94a03a9d706356ea4f9d39a4514074f7 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 15 Feb 2024 06:12:26 +0000 Subject: [PATCH 029/159] Add dynamic tuning feature to vllm --- gradlib/csrc/grad_funcs.cu | 413 +++++++++++++++ gradlib/csrc/hipbsolgemm.cu | 610 +++++++++++++++++++++++ gradlib/csrc/rocsolgemm.cu | 563 +++++++++++++++++++++ gradlib/gemm_runner.py | 62 +++ gradlib/gemm_tuner.py | 92 ++++ gradlib/gradlib/GemmTuner.py | 208 ++++++++ gradlib/mm_test.py | 234 +++++++++ gradlib/setup.py | 136 +++++ run_70b.sh | 39 +- run_70b_fast.sh | 16 +- vllm/model_executor/layers/tuned_gemm.py | 7 +- 11 files changed, 2342 insertions(+), 38 deletions(-) create mode 100644 gradlib/csrc/grad_funcs.cu create mode 100644 gradlib/csrc/hipbsolgemm.cu create mode 100644 gradlib/csrc/rocsolgemm.cu create mode 100644 gradlib/gemm_runner.py create mode 100644 gradlib/gemm_tuner.py create mode 100644 gradlib/gradlib/GemmTuner.py create mode 100644 gradlib/mm_test.py create mode 100644 gradlib/setup.py mode change 100644 => 100755 run_70b.sh mode change 100644 => 100755 run_70b_fast.sh diff --git a/gradlib/csrc/grad_funcs.cu b/gradlib/csrc/grad_funcs.cu new file mode 100644 index 0000000000000..f6498fb2a3ba7 --- /dev/null +++ b/gradlib/csrc/grad_funcs.cu @@ -0,0 +1,413 @@ +// #ifdef __gfx908__ +// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others +// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h +// #undef __HIP_NO_HALF_OPERATORS__ +// #undef __HIP_NO_HALF_CONVERSIONS__ +// #endif + +#include +#include 
+#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include + +#include +//#include +#include + +#include +#include +#include +#include +#include +#include +#include "nvToolsExt.h" + +// #ifdef USE_ROCM +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #endif + +// #ifdef __HIP_PLATFORM_HCC__ +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #if USE_GEMM_FLAGS_FP16_ALT_IMPL +// #ifdef ROCM_BACKWARD_PASS_GUARD +// flag = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0; +// #endif +// #endif +// #endif + +#ifndef CHECK_HIP_ERROR +#define CHECK_HIP_ERROR(error) \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, \ + "Hip error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +#ifndef CHECK_HIPBLAS_ERROR +#define CHECK_HIPBLAS_ERROR(error) \ + if(error != HIPBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, \ + "hipBLAS error: '%s'(%d) at %s:%d\n", \ + hipblasStatusToString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +namespace { + /*thread_local*/ cudaStream_t weight_stream; + // BUG: DLM has event and stream on different devices error + // In multi-GPU scenerio, do names defined in this namespace exist on all devices? + // C++ keyword: thread_local <- maybe this can help? 
+ /*thread_local*/ cudaEvent_t event; + + // hipBLASLt + hipblasLtHandle_t hipblaslt_handle; + hipblasLtMatmulPreference_t preference; + uint64_t workspace_size = 32*1024*1024; + //uint64_t workspace_size = 0; + void* d_workspace; + int request_solutions = 1; + int returnedAlgoCount = 0; + + struct MatMulConfig { + hipblasOperation_t op_A; + hipblasOperation_t op_B; + int M; + int N; + int K; + hipblasDatatype_t dtype; + + friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { + return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); + } + }; + + // std::map, std::vector> heuristic_map; + std::map heuristic_map; + + hipEvent_t start, stop; + int bench_iters { 1 }; + int warmup_iters { 1 }; + + bool cout_print = true; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * hipBLASLt GEMM call +*/ +hipblasStatus_t hipblasLtMatmul_wrapper( + hipblasLtHandle_t handle, + hipblasOperation_t op_A, + hipblasOperation_t op_B, + int m, int n, int k, + const void *alpha, + const void *a, + int lda, + const void *b, + int ldb, + const void *beta, + void *c, + int ldc, + hipblasDatatype_t dtype, + hipStream_t &stream) +{ + // TODO: flag is not supported for hipblasLt yet + int flag { 0 }; + if (dtype == HIPBLAS_R_16F) { + // use fp16 alt impl for MI200 + // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices + flag = rocblas_gemm_flags_fp16_alt_impl; + } + + nvtxRangePushA("hipBLASLt variables creation"); + hipblasLtMatrixLayout_t matA, matB, matC; + hipblasLtMatmulDesc_t matmul; + if (op_A == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); + } + if (op_B == 
HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); + } + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); + nvtxRangePop(); + + // if heuristic does not exist in the map, do search and push into the map + auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; + if (heuristic_map.count(gemm_key) <= 0) { + nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); + if (cout_print) { + std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") + << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype + << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; + } + std::vector heuristicResult(request_solutions); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + handle, matmul, matA, matB, matC, matC, + preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); + if((returnedAlgoCount != request_solutions) && cout_print) { + std::cout << "less solution found! 
request: " << request_solutions + << ", found: " << returnedAlgoCount << std::endl; + } + + if (returnedAlgoCount == 1) { + heuristic_map[gemm_key] = heuristicResult[0]; + } else { + // benchmark requested solutions and pick best one + int bestIndex { -1 }; + double bestMs { std::numeric_limits::max() }; + for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { + // warm up + for (int iter { 0 }; iter < warmup_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + // performance measuring + double eventMs; + CHECK_HIP_ERROR(hipEventRecord(start, stream)); + for (int iter { 0 }; iter < bench_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + CHECK_HIP_ERROR(hipEventRecord(stop, stream)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + float temp; + CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); + eventMs = double(temp); + eventMs /= bench_iters; + + if (cout_print) { + std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; + } + if (bestMs > eventMs) { + bestMs = eventMs; + bestIndex = sol; + if (cout_print) { + std::cout << " *" << std::endl; + } + } else { + if (cout_print) { + std::cout << std::endl; + } + } + } + heuristic_map[gemm_key] = heuristicResult[bestIndex]; + } + nvtxRangePop(); + } + + hipblasStatus_t status = hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, 
+ &heuristic_map[gemm_key].algo, + d_workspace, workspace_size, + stream); + + nvtxRangePushA("hipBLASLt variables deletion"); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); + nvtxRangePop(); + + return status; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +torch::Tensor hipBLASLtMm_( + const torch::Tensor& mat1, + const torch::Tensor& mat2) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; + + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && 
(mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl + // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl + // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; + // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl + // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; + + int flag { 0 }; + hipblasDatatype_t hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIPBLAS_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIPBLAS_R_16B; + } else if (abcType == at::kFloat) { + hipblasType = HIPBLAS_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + + void *ptrA { static_cast((transpose_result ? 
mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream)); + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void create_extension() +{ + CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); + CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); + CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( + preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void destroy_extension() +{ + CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); + CHECK_HIP_ERROR(hipEventDestroy(event)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); + CHECK_HIP_ERROR(hipFree(d_workspace)); + + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("create_extension", &create_extension, "create_extension"); + 
m.def("destroy_extension", &destroy_extension, "destroy_extension"); + m.def("mm", &hipBLASLtMm_, "mm"); +} diff --git a/gradlib/csrc/hipbsolgemm.cu b/gradlib/csrc/hipbsolgemm.cu new file mode 100644 index 0000000000000..bf15fb1297667 --- /dev/null +++ b/gradlib/csrc/hipbsolgemm.cu @@ -0,0 +1,610 @@ +// #ifdef __gfx908__ +// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others +// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h +// #undef __HIP_NO_HALF_OPERATORS__ +// #undef __HIP_NO_HALF_CONVERSIONS__ +// #endif + +#include +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "nvToolsExt.h" + +//#include + + +// #ifdef USE_ROCM +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #endif + +// #ifdef __HIP_PLATFORM_HCC__ +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #if USE_GEMM_FLAGS_FP16_ALT_IMPL +// #ifdef ROCM_BACKWARD_PASS_GUARD +// flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +// #endif +// #endif +// #endif + +#ifndef CHECK_HIP_ERROR +#define CHECK_HIP_ERROR(error) \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, \ + "Hip error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +#ifndef CHECK_HIPBLAS_ERROR +#define CHECK_HIPBLAS_ERROR(error) \ + if(error != HIPBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, \ + "hipBLAS error: '%s'(%d) at %s:%d\n", \ + hipblasStatusToString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +namespace { + /*thread_local*/ cudaStream_t weight_stream; + // BUG: DLM has event and stream on different devices error + // In multi-GPU scenerio, do names defined in this namespace exist on all devices? + // C++ keyword: thread_local <- maybe this can help? + /*thread_local*/ cudaEvent_t event; + + // hipBLASLt + hipblasLtHandle_t hipblaslt_handle; + hipblasLtMatmulPreference_t preference; + size_t workspace_size = 2*128*1024*1024; + //uint64_t workspace_size = 0; + void* d_workspace; + int request_solutions = 1; + int returnedAlgoCount = 0; + + struct MatMulConfig { + hipblasOperation_t op_A; + hipblasOperation_t op_B; + int M; + int N; + int K; + hipDataType dtype; + + friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { + return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); + } + }; + + // std::map, std::vector> heuristic_map; + std::map heuristic_map; + + hipEvent_t start, stop; + int bench_iters { 1 }; + int warmup_iters { 1 }; + + bool cout_print = false; + + //std::vector heuristicResult; +} + +//find all hipblaslt solutions for given gemm problem +std::vector hipblasLtMatmul_findallsols_wrapper( + hipblasLtHandle_t handle, + hipblasOperation_t op_A, + hipblasOperation_t op_B, + int m, int n, int k, + const void *alpha, 
+ const void *a, + int lda, + const void *b, + int ldb, + const void *beta, + void *c, + int ldc, + hipDataType dtype, + hipStream_t &stream) +{ + int flag { 0 }; + hipblasLtMatrixLayout_t matA, matB, matC; + hipblasLtMatmulDesc_t matmul; + if (op_A == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); + } + if (op_B == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); + } + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLAS_COMPUTE_32F, HIP_R_32F)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); + + //std::vector heuristicResult(10); + //CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + // handle, matmul, matA, matB, matC, matC, + // preference, 10, heuristicResult.data(), &returnedAlgoCount)); + std::vector heuristicResult; + CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAllAlgos(handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + op_A, + op_B, + dtype, + dtype, + dtype, + dtype, + HIPBLAS_COMPUTE_32F, + heuristicResult)); + + std::vector algoIndex; + int returned_algo_count = heuristicResult.size(); + //for (int i = 0; i < returnedAlgoCount; i++) { + for (int i = 0; i < returned_algo_count; i++) { + auto algo = heuristicResult[i].algo; + size_t ret_workspace_size = 0; + auto status = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, + alpha, + matA, + matB, + beta, + matC, + matC, + algo, + ret_workspace_size + ); + if (status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size heuristicResult(1); + if (solution_index<0) { + 
//nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); + std::cout << "Warning! HipbSolId Gemm Fallback Path used for solution index <0" << std::endl; + if (cout_print) { + std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") + << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype + << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; + } + //std::vector heuristicResult(request_solutions); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + handle, matmul, matA, matB, matC, matC, + preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); + if((returnedAlgoCount != request_solutions) && cout_print) { + std::cout << "less solution found! request: " << request_solutions + << ", found: " << returnedAlgoCount << std::endl; + } + //heuristic_map[gemm_key] = heuristicResult[0]; +/* + if (returnedAlgoCount == 1) { + heuristic_map[gemm_key] = heuristicResult[0]; + } else { + // benchmark requested solutions and pick best one + int bestIndex { -1 }; + double bestMs { std::numeric_limits::max() }; + for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { + // warm up + for (int iter { 0 }; iter < warmup_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + // performance measuring + double eventMs; + CHECK_HIP_ERROR(hipEventRecord(start, stream)); + for (int iter { 0 }; iter < bench_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + 
&heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + CHECK_HIP_ERROR(hipEventRecord(stop, stream)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + float temp; + CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); + eventMs = double(temp); + eventMs /= bench_iters; + + if (cout_print) { + std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; + } + if (bestMs > eventMs) { + bestMs = eventMs; + bestIndex = sol; + if (cout_print) { + std::cout << " *" << std::endl; + } + } else { + if (cout_print) { + std::cout << std::endl; + } + } + } + heuristic_map[gemm_key] = heuristicResult[bestIndex]; + } +*/ + //nvtxRangePop(); + } else { + std::vector algoIndex(1); + algoIndex[0]=solution_index; + //std::vector tmpAlgo; + CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAlgosFromIndex(handle, algoIndex, heuristicResult)); + } + + //size_t ret_workspace_size = 0; + + //auto status1 = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, + // alpha, + // matA, + // matB, + // beta, + // matC, + // matC, + // heuristicResult[0].algo, + // ret_workspace_size + //); + //if (status1 == HIPBLAS_STATUS_SUCCESS) { + // std::cout << "Workspace size" << ret_workspace_size << std::endl; + + //} else { + // std::cout << "Algo not supported!!!" 
<< std::endl; + + //} + hipblasStatus_t status = hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, + &heuristicResult[0].algo, + d_workspace, workspace_size, + stream); + + //nvtxRangePushA("hipBLASLt variables deletion"); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); + //nvtxRangePop(); + + return status; +} +///////////////////////////////////////////////////////////////////////////////////////////////////////// +torch::Tensor HipbSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2, + const int solution_index + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; + + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, 
mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl + // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl + // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; + // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 
0 : 1); + // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl + // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; + + hipDataType hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIP_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIP_R_16BF; + } else if (abcType == at::kFloat) { + hipblasType = HIP_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + CHECK_HIPBLAS_ERROR(hipblasLtMatmul_sol_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream,solution_index)); + + return result; +} + +//find all hipblas solutions and return them to python land +std::vector HipbFindAllSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2 + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= 
std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + hipDataType hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIP_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIP_R_16BF; + } else if (abcType == at::kFloat) { + hipblasType = HIP_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + return hipblasLtMatmul_findallsols_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? 
HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream); + +} +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void hipb_create_extension() +{ + //CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); + //CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); + CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( + preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + + //CHECK_HIP_ERROR(hipEventCreate(&start)); + //CHECK_HIP_ERROR(hipEventCreate(&stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void hipb_destroy_extension() +{ + //CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); + //CHECK_HIP_ERROR(hipEventDestroy(event)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); + CHECK_HIP_ERROR(hipFree(d_workspace)); + + //CHECK_HIP_ERROR(hipEventDestroy(start)); + //CHECK_HIP_ERROR(hipEventDestroy(stop)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("hipb_create_extension", &hipb_create_extension, "create_extension"); + m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension"); + m.def("hipb_mm", &HipbSolIdxBlas, "mm"); + m.def("hipb_findallsols", &HipbFindAllSolIdxBlas, "hipblas_find_all_sols"); +} diff --git a/gradlib/csrc/rocsolgemm.cu b/gradlib/csrc/rocsolgemm.cu new file mode 100644 index 0000000000000..d691fcac416a6 --- /dev/null +++ 
b/gradlib/csrc/rocsolgemm.cu @@ -0,0 +1,563 @@ +// #ifdef __gfx908__ +// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others +// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h +// #undef __HIP_NO_HALF_OPERATORS__ +// #undef __HIP_NO_HALF_CONVERSIONS__ +// #endif + +#define ROCBLAS_NO_DEPRECATED_WARNINGS +#define ROCBLAS_BETA_FEATURES_API + +#include +#include +#include +#include +#include +#include +#include +#include +// #include +#include +#include +#include +#include + +#include +//#include +#include + +#include +#include +#include +#include +#include +#include +#include "nvToolsExt.h" + +#include + + +// #ifdef USE_ROCM +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #endif + +// #ifdef __HIP_PLATFORM_HCC__ +// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) +// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) +// #if USE_GEMM_FLAGS_FP16_ALT_IMPL +// #ifdef ROCM_BACKWARD_PASS_GUARD +// flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; +// #endif +// #endif +// #endif + +#ifndef CHECK_HIP_ERROR +#define CHECK_HIP_ERROR(error) \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, \ + "Hip error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +#ifndef CHECK_HIPBLAS_ERROR +#define CHECK_HIPBLAS_ERROR(error) \ + if(error != HIPBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, \ + "hipBLAS error: '%s'(%d) at %s:%d\n", \ + hipblasStatusToString(error), \ + error, \ + __FILE__, \ + __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif + +namespace { + rocblas_handle r_handle; + + /*thread_local*/ cudaStream_t weight_stream; + // BUG: DLM has event and stream on different devices error + // In multi-GPU scenerio, do names defined in this namespace exist on all devices? + // C++ keyword: thread_local <- maybe this can help? + /*thread_local*/ cudaEvent_t event; + + // hipBLASLt + hipblasLtHandle_t hipblaslt_handle; + hipblasLtMatmulPreference_t preference; + uint64_t workspace_size = 32*1024*1024; + //uint64_t workspace_size = 0; + void* d_workspace; + int request_solutions = 1; + int returnedAlgoCount = 0; + + struct MatMulConfig { + hipblasOperation_t op_A; + hipblasOperation_t op_B; + int M; + int N; + int K; + hipblasDatatype_t dtype; + + friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { + return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); + } + }; + + // std::map, std::vector> heuristic_map; + std::map heuristic_map; + + hipEvent_t start, stop; + int bench_iters { 1 }; + int warmup_iters { 1 }; + + bool cout_print = true; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// +/** + * hipBLASLt GEMM call +*/ +/* +hipblasStatus_t hipblasLtMatmul_wrapper( + hipblasLtHandle_t handle, + 
hipblasOperation_t op_A, + hipblasOperation_t op_B, + int m, int n, int k, + const void *alpha, + const void *a, + int lda, + const void *b, + int ldb, + const void *beta, + void *c, + int ldc, + hipblasDatatype_t dtype, + hipStream_t &stream) +{ + // TODO: flag is not supported for hipblasLt yet + int flag { 0 }; + if (dtype == HIPBLAS_R_16F) { + // use fp16 alt impl for MI200 + // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices + flag = rocblas_gemm_flags_fp16_alt_impl; + } + + nvtxRangePushA("hipBLASLt variables creation"); + hipblasLtMatrixLayout_t matA, matB, matC; + hipblasLtMatmulDesc_t matmul; + if (op_A == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); + } + if (op_B == HIPBLAS_OP_N) { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); + } else { + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); + } + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); + nvtxRangePop(); + + // if heuristic does not exist in the map, do search and push into the map + auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; + if (heuristic_map.count(gemm_key) <= 0) { + nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); + if (cout_print) { + std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? 
"N" : "T") + << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype + << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; + } + std::vector heuristicResult(request_solutions); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( + handle, matmul, matA, matB, matC, matC, + preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); + if((returnedAlgoCount != request_solutions) && cout_print) { + std::cout << "less solution found! request: " << request_solutions + << ", found: " << returnedAlgoCount << std::endl; + } + + if (returnedAlgoCount == 1) { + heuristic_map[gemm_key] = heuristicResult[0]; + } else { + // benchmark requested solutions and pick best one + int bestIndex { -1 }; + double bestMs { std::numeric_limits::max() }; + for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { + // warm up + for (int iter { 0 }; iter < warmup_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + // performance measuring + double eventMs; + CHECK_HIP_ERROR(hipEventRecord(start, stream)); + for (int iter { 0 }; iter < bench_iters; ++iter) { + CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, // In case beta != 0, these runs can overwrite the values in c + // since c and d are the same + // TODO: allocates separate d memory for these runs + &heuristicResult[sol].algo, + d_workspace, workspace_size, + stream)); + } + CHECK_HIP_ERROR(hipEventRecord(stop, stream)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + float temp; + CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); + eventMs = double(temp); + eventMs /= bench_iters; + + if (cout_print) { + 
std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; + } + if (bestMs > eventMs) { + bestMs = eventMs; + bestIndex = sol; + if (cout_print) { + std::cout << " *" << std::endl; + } + } else { + if (cout_print) { + std::cout << std::endl; + } + } + } + heuristic_map[gemm_key] = heuristicResult[bestIndex]; + } + nvtxRangePop(); + } + + hipblasStatus_t status = hipblasLtMatmul(handle, matmul, + alpha, + a, matA, + b, matB, + beta, + c, matC, + c, matC, + &heuristic_map[gemm_key].algo, + d_workspace, workspace_size, + stream); + + nvtxRangePushA("hipBLASLt variables deletion"); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); + CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); + nvtxRangePop(); + + return status; +} +*/ +///////////////////////////////////////////////////////////////////////////////////////////////////////// +std::vector RocFindAllSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2 + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && 
(mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + + rocblas_set_stream(r_handle, current_stream); + uint32_t flags { 0 }; + rocblas_datatype abcRtype; + if (abcType == at::kHalf) { + abcRtype = rocblas_datatype_f16_r; + } else if (abcType == at::kBFloat16) { + abcRtype = rocblas_datatype_bf16_r; + } else if (abcType == at::kFloat) { + abcRtype = rocblas_datatype_f32_r; + } else { + assert(false && "Wrong datatype!"); + } + + #define GEMM_EX_ARGS \ + r_handle, transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, \ + m, n, k, &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, &zero, ptrC, \ + abcRtype, result_ld, ptrC, abcRtype, result_ld, rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index + + rocblas_int sizeSolve; + //CHECK_ROCBLAS_ERROR( + rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, NULL, &sizeSolve); + + // Fill array with list of solutions that match type + // Note: some of these may be invalid + std::vector solutionsSolve(sizeSolve); + //CHECK_ROCBLAS_ERROR( + rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, solutionsSolve.data(), &sizeSolve); + + std::vector validSolutions; + for(auto sol : solutionsSolve) { + auto status = rocblas_gemm_ex(r_handle, + transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, + transpose_mat2 ? rocblas_operation_transpose : rocblas_operation_none, + m, n, k, + &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, + &zero, ptrC, abcRtype, result_ld, + ptrC, abcRtype, result_ld, + rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, sol, rocblas_gemm_flags_none); + if (status == rocblas_status_success) { + validSolutions.push_back(sol); + } + } + + return validSolutions; +} +///////////////////////////////////////////////////////////////////////////////////////////////////////// +torch::Tensor RocSolIdxBlas( + const torch::Tensor& mat1, + const torch::Tensor& mat2, + const int32_t solution_index=0 + ) +{ + auto mat1_strides { mat1.strides() }; + auto mat2_strides { mat2.strides() }; + auto mat1_sizes { mat1.sizes() }; + auto mat2_sizes { mat2.sizes() }; + // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); + TORCH_CHECK( + mat1.dtype() == mat2.dtype(), + "expected mat1 and mat2 to have the same dtype, 
but got: ", mat1.dtype(), " != ", mat2.dtype() + ); + TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); + + auto abcType { mat1.options().dtype() }; + auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; + auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; + // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; + + bool transpose_result = true; + bool transpose_mat1; + bool transpose_mat2; + if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { + transpose_mat2 = false; + } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { + transpose_mat2 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { + transpose_mat1 = false; + } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { + transpose_mat1 = true; + } else { + assert(false && "unusual strides detected, may need to clone a contiguous tensor"); + } + + if (transpose_result) { + bool tmp = transpose_mat1; + transpose_mat1 = !transpose_mat2; + transpose_mat2 = !tmp; + mat1_strides = mat2.strides(); + mat2_strides = mat1.strides(); + mat1_sizes = mat2.sizes(); + mat2_sizes = mat1.sizes(); + } + // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl + // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl + // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; + // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl + // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; + + float one { 1.0f }; + float zero { 0.0f }; + int64_t m = mat1_sizes[transpose_result ? 1 : 0]; + int64_t k = mat1_sizes[transpose_result ? 
0 : 1]; + int64_t n = mat2_sizes[transpose_result ? 0 : 1]; + int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; + int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; + int64_t result_ld = result.stride(transpose_result ? 0 : 1); + // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl + // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; + + /* + int flag { 0 }; + hipblasDatatype_t hipblasType; + if (abcType == at::kHalf) { + hipblasType = HIPBLAS_R_16F; + } else if (abcType == at::kBFloat16) { + hipblasType = HIPBLAS_R_16B; + } else if (abcType == at::kFloat) { + hipblasType = HIPBLAS_R_32F; + } else { + assert(false && "Wrong datatype!"); + } + */ + void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; + void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; + void *ptrC { static_cast(result.data_ptr()) }; + auto current_stream { torch::hip::getCurrentHIPStream().stream() }; + /* + + CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( + hipblaslt_handle, + transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, + m, n, k, + &one, + ptrA, mat1_ld, + ptrB, mat2_ld, + &zero, + ptrC, result_ld, + hipblasType, + current_stream)); + */ + rocblas_set_stream(r_handle, current_stream); + uint32_t flags { 0 }; + //int32_t solution_index {0}; + rocblas_datatype abcRtype; + if (abcType == at::kHalf) { + abcRtype = rocblas_datatype_f16_r; + } else if (abcType == at::kBFloat16) { + abcRtype = rocblas_datatype_bf16_r; + } else if (abcType == at::kFloat) { + abcRtype = rocblas_datatype_f32_r; + } else { + assert(false && "Wrong datatype!"); + } + + //CHECK_ROCBLAS_ERROR( + rocblas_gemm_ex(r_handle, + transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, + transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, + m, n, k, + &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, + &zero, ptrC, abcRtype, result_ld, + ptrC, abcRtype, result_ld, + rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, solution_index, flags); + //); + + + return result; +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void rocb_create_extension() +{ + /* + CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); + CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); + CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( + preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); + + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); */ + rocblas_create_handle(&r_handle); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void rocb_destroy_extension() +{ + /* + CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); + CHECK_HIP_ERROR(hipEventDestroy(event)); + + // hipBLASLt + CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); + CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); + CHECK_HIP_ERROR(hipFree(d_workspace)); + + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); */ + rocblas_destroy_handle(r_handle); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("rocb_create_extension", &rocb_create_extension, "create_extension"); + m.def("rocb_destroy_extension", &rocb_destroy_extension, "destroy_extension"); + m.def("rocb_mm", &RocSolIdxBlas, "mm"); + 
m.def("rocb_findallsols", &RocFindAllSolIdxBlas, "rocblas_find_all_sols"); +} diff --git a/gradlib/gemm_runner.py b/gradlib/gemm_runner.py new file mode 100644 index 0000000000000..34a246771a820 --- /dev/null +++ b/gradlib/gemm_runner.py @@ -0,0 +1,62 @@ +import torch +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +import timeit + +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +class TunedGemm: + def __init__(self,tuned_csv_file): + self.bestsols = pd.read_csv(tuned_csv_file,index_col=[0]) + self.create_ds() + def create_ds(self): + df = self.bestsols + solds = {} + for i in range(len(df)): + ds = df.iloc[i] + key = (ds['M'],ds['N'],ds['K']) + if ds['libtype']=='hipblaslt': soltype = 1 + elif ds['libtype']=='rocblas': soltype = 2 + solds[key] = (soltype,int(ds['solidx'])) + #print(solds) + self.solids = solds + def query_sol(self,m,n,k): + return self.solids.get((m,n,k),(0,0)) + def mm(self,inp,weights): + soltype,solidx = self.query_sol(m=weights.shape[0],n=inp.shape[0],k=inp.shape[1]) + if soltype==1: + out = hipbsolidxgemm.hipb_mm(inp,weights.t(),solidx) + elif soltype==2: + out = rocsolidxgemm.rocb_mm(inp,weights.t(),solidx) + else: + out = F.linear(inp,weights) + return out + def run_all_tuned_sols(self): + for i in range(len(self.bestsols)): + ds = self.bestsols.iloc[i] + print('>>> Running tuned solution') + print(ds) + inp = torch.randn((ds['N'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') + weights = torch.randn((ds['M'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') + self.mm(inp,weights) + +def get_dtype(dtype_csv): + if dtype_csv=='torch.float16': + dtype = torch.float16 + elif dtype_csv=='torch.bfloat16': + dtype = torch.bfloat16 + elif dtype_csv=='torch.float32': + dtype = torch.float32 + return dtype + +if __name__ == '__main__': + tgemm = TunedGemm(sys.argv[1]) #csv file with tuned sols goes in argv[1] + 
print(tgemm.bestsols) + tgemm.run_all_tuned_sols() + + diff --git a/gradlib/gemm_tuner.py b/gradlib/gemm_tuner.py new file mode 100644 index 0000000000000..b6c69379cf6c6 --- /dev/null +++ b/gradlib/gemm_tuner.py @@ -0,0 +1,92 @@ +import torch +import os +import argparse +from gradlib.GemmTuner import GemmTuner +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +import json +import random +from pathlib import Path +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +''' +{'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 5120, 'initializer_range': 0.02, +'intermediate_size': 13824, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 40, 'num_hidden_layers': 40, 'num_key_value_heads': 40, +'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float16', 'transformers_version': '4.33.0.dev0', 'use_cache': True, 'vocab_size': 32000} +''' +def generate_mk_sets(model_dir, tp=1): + f = open(f'{model_dir}/config.json') + data = json.load(f) + hidden_size = data['hidden_size'] + intermediate_size = data['intermediate_size'] + total_num_heads = data['num_attention_heads'] + total_num_kv_heads = data['num_key_value_heads'] + head_dim = hidden_size // total_num_heads + return [((total_num_heads + (2*total_num_kv_heads)) * head_dim // tp, hidden_size), (hidden_size, hidden_size // tp), (intermediate_size *2 // tp, hidden_size), (hidden_size, intermediate_size // tp) ], hidden_size + +def get_dtype(dtype_str): + dtype = torch.float16 + if dtype_str == 'f32': + dtype = torch.float32 + elif dtype_str == 'bf16': + dtype = torch.bfloat16 + elif dtype_str == 'f16': + dtype = torch.float16 + else: + print('>>> Warning! 
Invalid dtype', dtype_str, 'using default dtype f16') + return dtype + + +def list_of_ints(arg): + return list(map(int, arg.split(','))) + +def load_input_gemms(input_file): + if Path(input_file).is_file(): + return + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--model_dir", type=str, default=os.getenv('GTUNE_MODEL', ""), help="Enter the location of your model directory") + parser.add_argument("--tuned_file", type=str, default=os.getenv('GTUNE_TUNED', "tuned.csv"), help="output file for tuned gemm solutions") + parser.add_argument("--input_file", type=str, default=os.getenv('GTUNE_INPUT', None), help="list of gemms to tune for, mutually exclusive with model_dir") + parser.add_argument("--tp", type=int, default=os.getenv('GTUNE_TP', 1), help="Tensor parallelism to be used.") + parser.add_argument("--dtype", type=str, default='f16', help="dtype f32 f16 bf16") + parser.add_argument("--rocblas-decode", action="store_true", default=False, help="forces rocblas solution on decode N=1") + parser.add_argument("--batch_size", type=int, default=os.getenv('GTUNE_BATCH_SIZE', 1), help="Batch size to tune for") + parser.add_argument("--nsets", type=list_of_ints, default=[1, 512, 1024, 2048, 3072, 4096, 8192, 16384], help="N sizes to tune for: 1,128,2048") + args = parser.parse_args() + + dtype = get_dtype(args.dtype) + + gtuner = GemmTuner(dtype, args.tuned_file, args.rocblas_decode) + nsets = [i * args.batch_size for i in args.nsets] + if args.input_file: + print(f">>> Loading {args.input_file}") + if not Path(args.input_file).is_file(): + print(f">>> ERROR: {args.input_file} does not exist. Exiting") + exit(1) + shapes = pd.read_csv(args.input_file) + for i in range(len(shapes)): + ds = shapes.iloc[i] + gtuner.add_gemm(ds['M'],ds['N'],ds['K']) + else: + if not args.model_dir: + print(">>> Warning! NO MODEL SPECIFIED. 
Tuning for LL2 13B TP1") + #LL2 13B sizes + mksets = [(15360, 5120), (5120, 5120), (27648, 5120), (5120, 13824)] + gtuner.add_gemm(m=32000, n=1, k=5120) # logits gemm + else: + mksets, hidden_size = generate_mk_sets(args.model_dir, args.tp) + gtuner.add_gemm(m=32000//args.tp, n=1 * args.batch_size, k=hidden_size) #TODO: Handle cases where vocab_size is not divisible by tp + + for n in sorted(nsets): + for m, k in mksets: + gtuner.add_gemm(m, n, k) + + gtuner.find_best_sols() diff --git a/gradlib/gradlib/GemmTuner.py b/gradlib/gradlib/GemmTuner.py new file mode 100644 index 0000000000000..273042cb12a05 --- /dev/null +++ b/gradlib/gradlib/GemmTuner.py @@ -0,0 +1,208 @@ +import torch +import os +import argparse +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +import json +import random +from pathlib import Path +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +rtol = 1e-5 +atol = 1 +dtype = torch.float16 + +class Gemm: + def __init__(self,m,n,k,dtype,rocblas_decode=False): + self.m=m + self.k=k + self.n=n + self.dtype=dtype + self.nb = 37 + self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') + self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') + #weights2 is used in measurement/warm iters to ensure HBM fetch for weight tensors + self.weights2 = torch.randn((self.nb, self.m, self.k), dtype=self.dtype, device='cuda') + self.blob = torch.ones(128*1024*1024, dtype=torch.float32, device='cuda') + self.topn = 20 #number of top solutions from each source + self.hipb_sols=[] + self.rtol = 1e-5 + self.atol = 1 + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + self.hipb_prefer_ratio = 0.995 #prefer hipblaslt unless rocblas time is less than this ratio of hipblaslt time + self.rocblas_decode=rocblas_decode + + + def find_hipblas_sols(self): + sols = 
hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) + print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols), flush=True) + #print(sols) + self.hipb_sols = sols + + + def check_gemm_ref(self,libtype,solidx): + ref = F.linear(self.inp,self.weights) + if libtype == 'hipblaslt': + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + elif libtype == 'rocblas': + c = rocsolidxgemm.rocb_mm(self.inp,self.weights.t(),solidx) + if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): + #print('>>>',libtype,'Solidx',solidx,'passed reference test') + return True + else: + print('>>>',libtype,'Solidx',solidx,'FAILED reference test', flush=True) + print(ref, flush=True) + print(c, flush=True) + return False + def hipb_time_sol(self,solidx,cold_iters=2,warm_iters=10): + #print('>>>hipbtime',solidx) + for i in range(cold_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + self.start.record() + for i in range(warm_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights2 [random.randint(0,self.nb-1)].t(),solidx) + self.end.record() + torch.cuda.synchronize() + gtime = self.start.elapsed_time(self.end)/warm_iters + #print('>>> Solidx GTime',solidx,gtime,'ms') + return gtime + def hipb_time_all_sols(self,fast_mode=0,top_sols=0): + coldi=20; warmi=20 + if fast_mode: coldi=2; warmi=2 + solutions = self.hipb_sols + if top_sols: solutions = self.hipb_top_sols + gtimes = {} + for solidx in solutions: + gtimes[solidx] = self.hipb_time_sol(solidx, cold_iters=coldi, warm_iters=warmi) + self.hipb_gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') + self.hipb_gtimedf.to_csv('/tmp/hipb_gtimedf.csv') + print('>>> HipBlasLt top solutions, Fast Mode',fast_mode) + print(self.hipb_gtimedf.head(self.topn)) + def rocb_time_sol(self, solidx, cold_iters=2, warm_iters=10): + for i in range(cold_iters): + c = rocsolidxgemm.rocb_mm(self.inp, self.weights.t(), solidx) + self.start.record() + for i 
in range(warm_iters): + c = rocsolidxgemm.rocb_mm(self.inp, self.weights2[random.randint(0, self.nb-1)].t(), solidx) + self.end.record() + torch.cuda.synchronize() + gtime = self.start.elapsed_time(self.end)/warm_iters + #print('>>> RocSolidx GTime',solidx,gtime,'ms') + return gtime + def find_rocblas_sols(self): + sols = rocsolidxgemm.rocb_findallsols(self.inp,self.weights.t()) + print('M N K',self.m,self.n,self.k,'>>> Total rocb solutions',len(sols), flush=True) + #print(sols) + self.rocb_sols = sols + def rocb_time_all_sols(self,fast_mode=0,top_sols=0): + coldi=20; warmi=20 + if fast_mode: coldi=2; warmi=2 + solutions = self.rocb_sols + if top_sols: solutions = self.rocb_top_sols + gtimes = {} + for solidx in solutions: + gtimes[solidx] = self.rocb_time_sol(solidx,coldi,warmi) + self.rocb_gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') + self.rocb_gtimedf.to_csv('/tmp/rocb_gtimedf.csv') + print('>>> Rocblas top solutions, Fast Mode',fast_mode, flush=True) + print(self.rocb_gtimedf.head(self.topn), flush=True) + def warmup(self,warmi=500): + for i in range(warmi): + self.blob = self.blob + 0.00001 + def functional_check_topn_fastest(self): + rocb_topn = [] + for solidx in self.rocb_gtimedf.index[:self.topn]: + if self.check_gemm_ref(libtype='rocblas',solidx=solidx): + rocb_topn.append(solidx) + self.rocb_top_sols = rocb_topn + hipb_topn = [] + for solidx in self.hipb_gtimedf.index[:self.topn]: + if self.check_gemm_ref(libtype='hipblaslt',solidx=solidx): + hipb_topn.append(solidx) + self.hipb_top_sols = hipb_topn + + def find_fastest_solution(self): + self.find_rocblas_sols() + if not (self.rocblas_decode and self.n == 1): + self.find_hipblas_sols() + self.warmup() + self.rocb_time_all_sols(fast_mode=1) + self.warmup() + self.hipb_time_all_sols(fast_mode=1) + self.functional_check_topn_fastest() + self.warmup() + self.rocb_time_all_sols(fast_mode=0,top_sols=1) + self.warmup() + 
self.hipb_time_all_sols(fast_mode=0,top_sols=1) + if len(self.rocb_gtimedf)>0 and len(self.hipb_gtimedf)>0: + best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0] + best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0] + if best_rocb_time0: + print('>>> Only hipblas solutions found!',flush=True) + best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0] + self.best_libtype = 'hipblaslt' + self.best_solidx = self.hipb_gtimedf.index[0] + self.best_soltime = best_hipb_time + elif len(self.rocb_gtimedf)>0: + print('>>> Only rocblas solutions found!',flush=True) + best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0] + self.best_libtype = 'rocblas' + self.best_solidx = self.rocb_gtimedf.index[0] + self.best_soltime = best_rocb_time + else: + print('>>> No rocblas or hipblas solutions found!',flush=True) + self.best_libtype = 'rocblas' + self.best_solidx = 0 + self.best_soltime = 0 + print('>>> Fastest Solution is',self.best_libtype,self.best_solidx,self.best_soltime,flush=True) + + +class GemmTuner: + def __init__(self, dtype, tuned_file=None, rocblas_decode=False): + self.gemm_problems = pd.DataFrame(columns=['M','N','K']) + self.dtype = dtype + self.rocblas_decode = rocblas_decode + self.tuned_file = tuned_file + if Path(tuned_file).is_file(): + self.gdf = pd.read_csv(tuned_file) + else: + self.gdf = None + + def add_gemm(self,m,n,k): + if ( self.gdf is None or (self.gdf[(self.gdf['M'] == m) & (self.gdf['N'] == n) & (self.gdf['K'] == k)].empty)): + entry = {'M':[m], 'N':[n], 'K':[k]} + df = pd.DataFrame(entry) + self.gemm_problems = pd.concat([self.gemm_problems, df],ignore_index=True) + else: + print(f">>>Info: Found Duplicate shape(M:{m}, N:{n}, K:{k}), skipping") + + def find_best_sols(self): + df = self.gemm_problems + soldf = pd.DataFrame() + for i in range(len(df)): + ds = df.iloc[i] + gemmobj = Gemm(ds['M'],ds['N'],ds['K'],dtype=self.dtype, rocblas_decode=self.rocblas_decode) + gemmobj.find_fastest_solution() + soldf.loc[i,'libtype'] = gemmobj.best_libtype + 
soldf.loc[i,'solidx'] = gemmobj.best_solidx + soldf.loc[i,'soltimems'] = gemmobj.best_soltime + soldf['dtype'] = self.dtype + finaldf = pd.concat([self.gemm_problems, soldf],axis=1) + finaldf = pd.concat([finaldf, self.gdf]) + finaldf.to_csv(self.tuned_file, index=False) + print(finaldf) diff --git a/gradlib/mm_test.py b/gradlib/mm_test.py new file mode 100644 index 0000000000000..1b21b9ca105ff --- /dev/null +++ b/gradlib/mm_test.py @@ -0,0 +1,234 @@ +import torch +#import gradlib +import rocsolidxgemm +import hipbsolidxgemm +import numpy as np +import torch.nn.functional as F +import sys +import pandas as pd +#gradlib.create_extension() +rocsolidxgemm.rocb_create_extension() +hipbsolidxgemm.hipb_create_extension() + +#m = 128; n = 192 ;k = 256 +#m = 7168; k = 4096*2; n = 256 +#m = int(1024*1.25); k = int(1024*8); n = 1 +#m = 1; k = int(1024*8); n = int(1024*7) +#m=22016; k=4096 ; n=1 +#m=int(27648/1);k=5120;n=8 +#m=5120;k=13824;n=1 +m=3*5120;k=5120;n=1 + + +rtol = 1e-5 +atol = 1 +dtype = torch.float16 + +class Gemm: + def __init__(self,m,n,k,dtype=torch.float16): + self.m=m + self.k=k + self.n=n + self.dtype=dtype + self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') + self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') + self.hipb_sols=[] + self.rtol = 1e-5 + self.atol = 1 + self.cold_iters = 2 + self.warm_iters = 10 + def find_hipblas_sols(self): + sols = hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) + print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols)) + #print(sols) + self.hipb_sols = sols + def hipb_check_gemm_ref(self,user_solidxs=None): + ref = F.linear(self.inp,self.weights) + if user_solidxs is not None: + solidxs = user_solidxs + else: + solidxs = self.hipb_sols + if len(solidxs)>0: + for solidx in solidxs: + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): + print('>>> Hipb solidx',solidx,'passed 
reference test') + else: + print('>>> Hipb solidx',solidx,'FAILED reference test') + print(ref) + print(c) + def hipb_time_sol(self,solidx): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + for i in range(self.cold_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + start.record() + for i in range(self.warm_iters): + c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) + end.record() + torch.cuda.synchronize() + gtime = start.elapsed_time(end)/self.warm_iters + #print('>>> Solidx GTime',solidx,gtime,'ms') + return gtime + def hipb_time_all_sols(self): + gtimes = {} + for solidx in self.hipb_sols: + gtimes[solidx] = self.hipb_time_sol(solidx) + self.gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') + self.gtimedf.to_csv('/tmp/gtimedf.csv') + print(self.gtimedf.head(10)) + + + +gemmobj = Gemm(m=3*5120,n=1,k=5120) +gemmobj.find_hipblas_sols() +#gemmobj.hipb_check_gemm_ref() +#gemmobj.hipb_check_gemm_ref(user_solidxs=[131,8190]) +#gemmobj.hipb_time_sol(gemmobj.hipb_sols[0]) +gemmobj.hipb_time_all_sols() +gemmobj.hipb_check_gemm_ref(user_solidxs=gemmobj.gtimedf.head(5).index.values) + +sys.exit() +def splitk_linear(inp,w,splitk=2): + wsp = torch.chunk(w,splitk,dim=1) + isp = torch.chunk(inp,splitk,dim=1) + print('>>>',isp[0].shape,wsp[1].shape) + cnew = [] + for i in range(splitk): + cnew.append(F.linear(isp[i],wsp[i])) + #cnew1 = F.linear(isp[1],wsp[1]) + c = cnew[0] + for i in range(1,splitk): + c.add_(cnew[i]) + #c = torch.add(cnew0,cnew1) + + return c + +def splitm_linear(inp,w,splitm=2,splits=None,splitk=1): + outputp=[] + #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) + if splits is not None: + wsp = torch.split(w,splits) + else: + wsp = torch.chunk(w,splitm) + #cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) + #csp = torch.chunk(cout,splitm,dim=1) + + for i,_ in enumerate(wsp): + 
#print('>>>wspi',wsp[i].shape) + if splitk==1: + outputp.append(F.linear(inp, wsp[i])) + #cout[:,i*wsp[i].shape[0]:(i+1)*wsp[i].shape[0]] = F.linear(inp, wsp[i]) + #csp[i].copy_(F.linear(inp, wsp[i])) + else: + outputp.append(splitk_linear(inp,wsp[i],splitk)) + c = torch.cat((outputp),dim=1) + #print('>>>',c.shape,cout.shape) + return c + +def splitn_linear(inp,w,splitn=2,splits=None): + outputp=[] + if splits is not None: + isp = torch.split(inp,splits) + else: + isp = torch.chunk(inp,splitn) + cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) + for i,_ in enumerate(isp): + outputp.append(F.linear(isp[i], w)) + #cout[i*isp[i].shape[0]:(i+1)*isp[i].shape[0],:] = F.linear(isp[i], w) + c = torch.cat((outputp),dim=0) + #print('>>>',c.shape,cout.shape) + return c + +nncount = 0 +for _ in range(10): + #a = torch.randn((m, k), dtype=dtype, device='cuda') + #b = torch.randn((k, n), dtype=dtype, device='cuda') + inp = torch.randn((n, k), dtype=dtype, device='cuda') + weights = torch.randn((m, k), dtype=dtype, device='cuda') + #c = gradlib.mm(inp, weights.t()) + c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) + c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) + c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) + c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) + + splitm=2 + #padm=2 + outsp=[] + #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) + #wsp = torch.chunk(weights,splitm) + #wsp = torch.split(weights,(3*1024,4*1024)) + #c = torch.empty((n,m),dtype=dtype,device='cuda') + #outtup = [] + #for i,_ in enumerate(wsp): + # print('>>>wspi',wsp[i].shape) + # outsp.append(F.linear(inp, wsp[i])) + # #outtup.append(splitk_linear(inp, wsp[i])) + #outsp = [torch.add(a,b) for a,b in outtup] + #c = torch.cat((outsp),dim=1) + #c = c[:,:-padm] + #c = splitm_linear(inp,weights,splitm=4,splits=None,splitk=1) + #c = splitn_linear(inp,weights,splitn=2,splits=None) + + #wsp = torch.chunk(weights,2,dim=1) + #isp = torch.chunk(inp,2,dim=1) + 
#print('>>>',isp[0].shape,wsp[1].shape) + #cnew0 = F.linear(isp[0],wsp[0]) + #cnew1 = F.linear(isp[1],wsp[1]) + #c = torch.add(cnew0,cnew1) + #c = splitk_linear(inp, weights, splitk=4) + + #torch.cuda.synchronize() + ref = F.linear(inp,weights) + #ref = torch.matmul(a,b) + if torch.allclose(c, ref, atol=atol, rtol=rtol): + nncount += 1 + else: + print(ref) + print(c) +''' +tncount = 0 +for _ in range(10): + a = torch.randn((m, k), dtype=dtype, device='cuda') + b = torch.randn((n, k), dtype=dtype, device='cuda') + c = gradlib.mm(a, b.t()) + #torch.cuda.synchronize() + ref = torch.matmul(a, b.t()) + if torch.allclose(c, ref, atol=atol, rtol=rtol): + tncount += 1 + else: + print(ref) + print(c) + #torch.save(c-ref, '/tmp/difference.pt') + #np.savetxt('my_file.txt', (c-ref).cpu().numpy()) + dfs = ref - c + nz = torch.nonzero(dfs,as_tuple=True) + print(nz) + print(dfs[nz]) + print(ref[nz]) + print(c[nz]) +''' +''' +ntcount = 0 +for _ in range(10): + a = torch.randn((k, m), dtype=dtype, device='cuda') + b = torch.randn((k, n), dtype=dtype, device='cuda') + c = gradlib.mm(a.t(), b) + #torch.cuda.synchronize() + if torch.allclose(c, torch.matmul(a.t(), b), atol=atol, rtol=rtol): + ntcount += 1 + +ttcount = 0 +for _ in range(10): + a = torch.randn((k, m), dtype=dtype, device='cuda') + b = torch.randn((n, k), dtype=dtype, device='cuda') + c = gradlib.mm(a.t(), b.t()) + torch.cuda.synchronize() + if torch.allclose(c, torch.matmul(a.t(), b.t()), atol=atol, rtol=rtol): + ttcount += 1 +''' +print(f"GEMM (m, n, k) = {n}, {m}, {k}") +print(f"NN GEMMs: pass {nncount}/10, tol={rtol}") +#print(f"TN GEMMs: pass {tncount}/10, tol={rtol}") +#print(f"NT GEMMs: pass {ntcount}/10, tol={rtol}") +#print(f"TT GEMMs: pass {ttcount}/10, tol={rtol}") diff --git a/gradlib/setup.py b/gradlib/setup.py new file mode 100644 index 0000000000000..1ca83dbe79f6c --- /dev/null +++ b/gradlib/setup.py @@ -0,0 +1,136 @@ +import torch +import setuptools +from setuptools import setup +from 
torch.utils.cpp_extension import BuildExtension, CUDAExtension +from torch.utils.hipify import hipify_python +import os +import subprocess +import re + +this_dir = os.path.dirname(os.path.abspath(__file__)) +#gpus = subprocess.check_output("/opt/rocm/bin/rocminfo").decode('UTF-8').split('\n') +#gpus = list(set([re.search('(gfx94.)', g).group(0) for g in gpus if 'gfx94' in g])) +gpus = ['gfx90a','gfx940','gfx941','gfx942'] +#gpus = ['gfx90a','gfx940'] +extra_args = ["--offload-arch=" + g for g in gpus] + + +#sets_rocm_pytorch = False +maj_ver, min_ver, *_ = torch.__version__.split('.') +if int(maj_ver) > 1 or (int(maj_ver) == 1 and int(min_ver) >= 5): + from torch.utils.cpp_extension import ROCM_HOME + is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False + +ext_modules = [] + +generator_flag = [] +torch_dir = torch.__path__[0] +if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')): + generator_flag = ['-DOLD_GENERATOR'] + +print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) +TORCH_MAJOR = int(torch.__version__.split('.')[0]) +TORCH_MINOR = int(torch.__version__.split('.')[1]) + +version_ge_1_1 = [] +if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): + version_ge_1_1 = ['-DVERSION_GE_1_1'] +version_ge_1_3 = [] +if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): + version_ge_1_3 = ['-DVERSION_GE_1_3'] +version_ge_1_5 = [] +if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): + version_ge_1_5 = ['-DVERSION_GE_1_5'] +version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + +include_dirs=[os.path.join(this_dir, 'csrc')] + +#if is_rocm_pytorch: +# import shutil +# with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: +# hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", +# show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) + +if not 
is_rocm_pytorch: + ext_modules.append( + CUDAExtension( + name='gradlib', + sources=['grad_funcs.cu'], + extra_compile_args={ + 'cxx': ['-O3',], + 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode=arch=compute_70,code=sm_70','-gencode=arch=compute_80,code=sm_80','-gencode=arch=compute_80,code=compute_80'] + } + ) + ) +elif is_rocm_pytorch: + #if torch.__version__ <= '1.8': + hipify_ver = [int(x) for x in torch.utils.hipify.__version__.split(".")] if hasattr(torch.utils.hipify, "__version__") else [0,0,0] + if hipify_ver < [1,0,0]: + import shutil + with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: + hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", + show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) + + ext_modules.append( + CUDAExtension( + name='gradlib', + sources=['./csrc/hip/grad_funcs.hip'], + extra_compile_args={ + 'cxx': ['-O3',] + version_dependent_macros, + 'nvcc':['-O3'] + extra_args + } + ) + ) + else: + #ext_modules.append( + # CUDAExtension( + # name='gradlib', + # sources=['./csrc/grad_funcs.cu'], + # include_dirs=include_dirs, + # # add additional libraries argument for hipblaslt + # libraries=['hipblaslt'], + # extra_compile_args={ + # 'cxx': ['-O3',], + # 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', + # "-ftemplate-depth=1024"] + extra_args + # } + # ) + # ) + ext_modules.append( + CUDAExtension( + name='rocsolidxgemm', + sources=['./csrc/rocsolgemm.cu'], + include_dirs=include_dirs, + # add additional libraries argument for hipblaslt + libraries=['rocblas'], + extra_compile_args={ + 'cxx': ['-O3',], + 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', + "-ftemplate-depth=1024"] + extra_args + } + ) + ) + ext_modules.append( + CUDAExtension( + name='hipbsolidxgemm', + sources=['./csrc/hipbsolgemm.cu'], 
+ include_dirs=include_dirs, + # add additional libraries argument for hipblaslt + libraries=['hipblaslt'], + extra_compile_args={ + 'cxx': ['-O3',], + 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', + "-ftemplate-depth=1024"] + extra_args + } + ) + ) + +setup( + name='gradlib', + packages=['gradlib'], + ext_modules=ext_modules, + cmdclass={ + 'build_ext': BuildExtension +}) + +# python setup.py build && cp build/lib*/gradlib* ../ diff --git a/run_70b.sh b/run_70b.sh old mode 100644 new mode 100755 index 46e342826b2a7..ed004b56c17d3 --- a/run_70b.sh +++ b/run_70b.sh @@ -1,6 +1,6 @@ #!/bin/bash -BASE_DIR=/workspace -VLLM_DIR=$BASE_DIR/vllm-private +BASE_DIR=/trees +VLLM_DIR=$BASE_DIR/vllm GRAD_DIR=$BASE_DIR/gradlib RPD_DIR=/workspace/rocmProfileData MODEL=/data/llama2-70b-chat @@ -10,7 +10,7 @@ GEMM_TUNER=1 #TP="1 2 4 8" TP=8 #Flag to use Triton Flash Attention vs CK -export VLLM_USE_TRITON=1 +#export VLLM_USE_TRITON=1 #Gemm tuner flags export VLLM_TUNE_GEMM=0 @@ -21,22 +21,21 @@ export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" #export VLLM_USE_TORCH_MULTINOMIAL=1 #Delete tuned gemms before running. 
-DELETE_TUNED_CSV=1 +#DELETE_TUNED_CSV=1 #Flag to disable MSCCL #export RCCL_MSCCL_ENABLE=0 #HIPGraph performance flags export HIP_FORCE_DEV_KERNARG=1 export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 #Enable full decoder graph mode -HIP_GRAPH=--use-cuda-graph #Use top of tree build of RCCL export LD_LIBRARY_PATH=/workspace/rccl/build/ #Enable either flag to create a profile trace (rocprof, or rocpd) #RPD_PROFILE="--profile" #ROCPROF_PROFILE="rocprof --hip-trace" -GEN_LEN="1 32" -#INPUT_LEN="512 1024 2048 3072" -INPUT_LEN="512 1024 2048 3072 4096 6144 8192 16384" +GEN_LEN="1,32,128" +INPUT_LEN="512,1024,2048,3072" + ITER=10 # pring usage of the parameters usage() { @@ -57,26 +56,16 @@ for tp in $TP; do if (( $GEMM_TUNER )); then - echo "tuned_gemm_csv: ./tuned_tp$tp.csv" > $VLLM_DIR/tuned_perf_tp$tp.yaml - tuned_file=$VLLM_DIR/tuned_tp$tp.csv - if [[ $DELETE_TUNED_CSV == 1 || ! -f $VLLM_DIR/tuned_tp$tp.csv ]]; - echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf.yaml + echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml + if [[ $DELETE_TUNED_CSV == 1 ]]; then - rm -rf $tuned_file - echo "INFO: Generating Tuned Gemm configs" - cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --output $tuned_file --tp $tp + rm -rf $VLLM_TUNE_FILE fi - export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml + #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml echo "INFO: Generating Tuned Gemm configs" cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --output $VLLM_TUNE_FILE --tp $tp - - - echo "================================= TUNED GEMMS $tuned_file ===============================================" - cat $tuned_file - + python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp fi cd $VLLM_DIR @@ -91,7 +80,7 @@ do fi echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py 
--model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER $HIP_GRAPH $RPD_PROFILE + --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report if [[ -v ROCPROF_PROFILE ]] ; then TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json @@ -106,4 +95,4 @@ do fi done done -done \ No newline at end of file +done diff --git a/run_70b_fast.sh b/run_70b_fast.sh old mode 100644 new mode 100755 index 585e0ebdd000c..0ed20e59ca3ff --- a/run_70b_fast.sh +++ b/run_70b_fast.sh @@ -1,8 +1,8 @@ #!/bin/bash set -e -BASE_DIR=/workspace -VLLM_DIR=$BASE_DIR/vllm-private -GRAD_DIR=/trees/gradlib +BASE_DIR=/trees +VLLM_DIR=$BASE_DIR/vllm +GRAD_DIR=$BASE_DIR/gradlib RPD_DIR=/workspace/rocmProfileData MODEL=/data/llama2-70b-chat MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` @@ -27,8 +27,6 @@ export VLLM_USE_TRITON=1 export HIP_FORCE_DEV_KERNARG=1 export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 -#Enable full decoder graph mode -HIP_GRAPH=--use-cuda-graph #Use top of tree build of RCCL export LD_LIBRARY_PATH=/workspace/rccl/build/ @@ -39,8 +37,8 @@ export LD_LIBRARY_PATH=/workspace/rccl/build/ #TP="1 2 4 8" TP=8 -GEN_LEN="1,32" -INPUT_LEN="512 1024 2048 3072" +GEN_LEN="32" +INPUT_LEN="512,1024,2048,3072" #INPUT_LEN="512,1024,2048,3072,4096,6144,8192,16384" BATCH_SIZE="1" ITER=10 @@ -59,11 +57,9 @@ do python $GRAD_DIR/gemm_tuner.py --tuned_file $VLLM_TUNE_FILE --input_file $VLLM_UNTUNE_FILE echo "File does not exist." 
fi - echo "================================= TUNED GEMMS $tuned_file ===============================================" - cat $VLLM_TUNE_FILE export VLLM_TUNE_GEMM=0 echo "================================= RUNNING $MODEL ===============================================" $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ - --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv $HIP_GRAPH + --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv done \ No newline at end of file diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 6d0a3ed3f2a60..1dc0dcc9a4670 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -14,12 +14,13 @@ def __init__(self): #rocb_create_extension() #hipb_create_extension() self.extensions_created = False - self.bestsols = {} - self.load_best_sols() - self.create_ds() self.save_gemm = int(os.environ.get('VLLM_TUNE_GEMM',0)) self.untune_path = os.environ.get('VLLM_UNTUNE_FILE', "/tmp/vllm_untuned.csv") self.tune_path = os.environ.get('VLLM_TUNE_FILE', "tuned.csv") + self.bestsols = {} + self.load_best_sols() + self.create_ds() + if (self.save_gemm == 1): self.tuned_df = pd.DataFrame(columns=['M','N','K']) From 694ae1d78e93728a0c492c13e5d3a51ce6729c1d Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 15 Feb 2024 06:20:44 +0000 Subject: [PATCH 030/159] Add rpd tracer controls to benchmark_latency.py --- Dockerfile.rocm | 129 +++++++++++++------------------- benchmarks/benchmark_latency.py | 26 ++++++- 2 files changed, 75 insertions(+), 80 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 3c76305303037..08783e0a1e0bd 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,85 +1,56 @@ -# default base image -ARG 
BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" -# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - - +FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 +ENV WORKSPACE_DIR=/workspace +RUN mkdir -p $WORKSPACE_DIR +WORKDIR $WORKSPACE_DIR +# Limit arch's so composable kernel doesn't take days to finish +ENV PYTORCH_ROCM_ARCH=gfx90a;gfx942 ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - -ARG FA_BRANCH="3d2b6f5" -RUN echo "FA_BRANCH is $FA_BRANCH" - -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y - -# Install some basic utilities -RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - build-essential \ - wget \ - unzip \ - nvidia-cuda-toolkit \ - tmux \ - && rm -rf /var/lib/apt/lists/* - -### Mount Point ### -# When launching the container, mount the code directory to /app -ARG APP_MOUNT=/app -VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT} - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas - -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: -ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: - -# Install ROCm flash-attention -RUN mkdir libs \ - && cd libs \ - && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ +RUN apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev +RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention \ && cd flash-attention \ - 
&& git checkout ${FA_BRANCH} \ - && git submodule update --init \ && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd .. - -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi - -RUN cd /app \ - && cd vllm \ - && pip install -U -r requirements-rocm.txt \ + && python setup.py install +RUN git clone -b develop https://github.com/ROCmSoftwarePlatform/hipBLASLt \ + && export GTest_DIR="/usr/local/lib/cmake/GTest/" \ + && cd hipBLASLt \ + && ./install.sh -idc --architecture 'gfx90a;gfx942' \ + && cd ../ && rm -rf hipBLASLt +RUN sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status +RUN sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status + +RUN git clone https://streamhsa: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@github.com/rocm/rocBLAS-internal.git \ + && export GTest_DIR="/usr/local/lib/cmake/GTest/" \ + && cd rocBLAS-internal \ + && git fetch origin 4f353a8035da38c8b8873823c09a499db777b231 \ + && git checkout 4f353a8035da38c8b8873823c09a499db777b231 \ + && ./install.sh -idc -a 'gfx90a;gfx942' \ + && cd ../ && rm -rf rocBLAS-internal + +RUN pip uninstall -y triton +RUN git clone https://github.com/ROCmSoftwarePlatform/triton.git \ + && cd triton/python && pip3 install -e . 
+ENV MAX_JOBS=32 +RUN cd ${WORKSPACE_DIR} \ + && git clone -b exp_bandaid https://github.com/ROCmSoftwarePlatform/rccl \ + && cd rccl && mkdir build && cd build \ + && CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ .. && make -j + +RUN pip install xformers==0.0.23 --no-deps +ADD ./ $WORKSPACE_DIR/vllm + +RUN cd vllm \ + && pip install -r requirements-rocm.txt \ + && pip install typing-extensions==4.8.0 \ && bash patch_xformers.rocm.sh \ - && python3 setup.py install \ - && cd .. + && cd gradlib && python setup.py develop && cd ../ \ + && python setup.py build && python setup.py develop; exit 0 + +RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3 -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir ray[all] +RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ + && cd rocmProfileData && make; make install -CMD ["/bin/bash"] +COPY docker/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6.0.60000 +COPY docker/libfile_plugin.so /opt/rocm/lib/roctracer +COPY docker/run_13b.sh $WORKSPACE_DIR/ +COPY docker/run_70b.sh $WORKSPACE_DIR/ diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index f9b49ebfaa132..e4d70851e46ef 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -71,6 +71,11 @@ def run_to_completion(profile_dir: Optional[str] = None): print("Warming up...") run_to_completion(profile_dir=None) + + if (args.warmup_only): + + print(">>> Warmup only specified, exiting") + continue if args.profile: profile_dir = args.profile_result_dir @@ -81,12 +86,31 @@ def run_to_completion(profile_dir: Optional[str] = None): print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=args.profile_result_dir) return + if args.rpd: + from rpdTracerControl import rpdTracerControl + rpdTracerControl.setFilename(name = "/workspace/trace.rpd", append=True) + profile_rpd = rpdTracerControl() + profile_rpd.start() + 
print(f"RPD Profiling'...") + run_to_completion(profile_dir=None) + profile_rpd.stop() + return # Benchmark. latencies = [] for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): latencies.append(run_to_completion(profile_dir=None)) - print(f'Avg latency: {np.mean(latencies)} seconds') + + if torch.distributed.get_rank() == 0: + #results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) + latency=np.mean(latencies) + print(f'Avg latency: {latency} seconds') + if args.report: + entry = {'model':[args.model], 'tp':[args.tensor_parallel_size],'batch':[batch_size], 'input':[input_len], 'output':[output_len], 'latency':[latency]} + results_df = pd.concat([results_df, pd.DataFrame(entry)], ignore_index=True) + if torch.distributed.get_rank() == 0 and args.report: + print(results_df) + results_df.to_csv(args.report_file, index=False) From eaf08ff0003639553325edf56b1d83a813a210e8 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 15 Feb 2024 18:08:50 +0000 Subject: [PATCH 031/159] Initial conversion back to KV cache scales in model, using float scaling factor instead of Tensor, should be working out the box --- benchmarks/benchmark_throughput.py | 16 +++++- vllm/config.py | 13 +++-- vllm/engine/arg_utils.py | 3 +- vllm/engine/llm_engine.py | 2 +- vllm/model_executor/layers/attention.py | 33 ++++++++---- .../layers/triton_kernel/prefix_prefill.py | 6 ++- vllm/model_executor/models/aquila.py | 7 ++- vllm/model_executor/models/baichuan.py | 7 ++- vllm/model_executor/models/bloom.py | 7 ++- vllm/model_executor/models/chatglm.py | 5 +- vllm/model_executor/models/deepseek.py | 7 ++- vllm/model_executor/models/falcon.py | 7 ++- vllm/model_executor/models/gpt2.py | 5 +- vllm/model_executor/models/gpt_bigcode.py | 5 +- vllm/model_executor/models/gpt_j.py | 7 ++- vllm/model_executor/models/gpt_neox.py | 7 ++- vllm/model_executor/models/internlm.py | 7 ++- vllm/model_executor/models/internlm2.py | 7 ++- 
vllm/model_executor/models/llama.py | 16 ++++-- vllm/model_executor/models/mistral.py | 7 ++- vllm/model_executor/models/mixtral.py | 7 ++- vllm/model_executor/models/mixtral_quant.py | 7 ++- vllm/model_executor/models/mpt.py | 7 ++- vllm/model_executor/models/opt.py | 6 +-- vllm/model_executor/models/phi.py | 7 ++- vllm/model_executor/models/qwen.py | 7 ++- vllm/model_executor/models/qwen2.py | 7 ++- vllm/model_executor/models/stablelm.py | 7 ++- vllm/model_executor/models/yi.py | 7 ++- vllm/model_executor/weight_utils.py | 11 ++-- vllm/worker/cache_engine.py | 50 +++---------------- vllm/worker/model_runner.py | 22 +++++++- 32 files changed, 154 insertions(+), 165 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 6102d360ee9a1..28a237e962a1a 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -72,6 +72,7 @@ def run_vllm( max_model_len: Optional[int], enforce_eager: bool, kv_cache_dtype: str, + kv_cache_scales: Optional[str], ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -85,6 +86,7 @@ def run_vllm( max_model_len=max_model_len, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, + kv_cache_scales=kv_cache_scales, ) # Add the requests to the engine. @@ -106,8 +108,9 @@ def run_vllm( start = time.perf_counter() # FIXME(woosuk): Do not use internal method. - llm._run_engine(use_tqdm=True) + outputs = llm._run_engine(use_tqdm=True) end = time.perf_counter() + print(outputs[-1]) return end - start @@ -209,7 +212,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype) + args.kv_cache_dtype, args.kv_cache_scales) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -295,6 +298,15 @@ def main(args: argparse.Namespace): help='Data type for kv cache storage. 
If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + parser.add_argument( + '--kv-cache-scales', + type=str, + default=None, + help='Path to the JSON file containing the KV cache scaling factors. ' + 'This should generally be supplied when KV cache dtype is FP8. Otherwise ' + 'the KV cache scaling factors default to 1.0, which will likely cause ' + 'accuracy issues. Note FP8 is not supported when cuda version is ' + 'lower than 11.8.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/config.py b/vllm/config.py index b80385f23ef43..25ad9743373bd 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,6 +27,10 @@ class ModelConfig: downloading the model and tokenizer. download_dir: Directory to download and load the weights, default to the default cache directory of huggingface. + kv_cache_scales: Path to file containing a JSON serialization of a map + of layer indices to their respective KV cache scaling factors. Used to + load aforementioned scaling factors into the model when KV cache type + is FP8. 
load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is @@ -66,6 +70,7 @@ def __init__( tokenizer_mode: str, trust_remote_code: bool, download_dir: Optional[str], + kv_cache_scales: Optional[str], load_format: str, dtype: Union[str, torch.dtype], seed: int, @@ -81,6 +86,7 @@ def __init__( self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code self.download_dir = download_dir + self.kv_cache_scales = kv_cache_scales self.load_format = load_format self.seed = seed self.revision = revision @@ -285,7 +291,6 @@ def __init__( gpu_memory_utilization: float, swap_space: int, cache_dtype: str, - kv_cache_scales: Optional[str] = None, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size @@ -293,7 +298,6 @@ def __init__( self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window - self.kv_cache_scales = kv_cache_scales self._verify_args() self._verify_cache_dtype() @@ -311,11 +315,6 @@ def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass elif self.cache_dtype == "fp8": - if self.kv_cache_scales is None: - logger.warn(f"Using cache dtype {self.cache_dtype} but no " - "scaling factors provided. 
Defaulting to 1.0 " - "scales, be warned that this might lead to " - "inaccurate results!") if not is_hip(): nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version < Version("11.8"): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f402396b333fc..b070029579a92 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -281,7 +281,7 @@ def create_engine_configs( Optional[LoRAConfig]]: model_config = ModelConfig(self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, - self.download_dir, self.load_format, + self.download_dir, self.kv_cache_scales, self.load_format, self.dtype, self.seed, self.revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager, @@ -289,7 +289,6 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - self.kv_cache_scales, model_config.get_sliding_window()) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f0d5724071a99..bee73b56e0a40 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -85,7 +85,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " - f"kv_cache_scales={cache_config.kv_cache_scales}, " + f"kv_cache_scales={model_config.kv_cache_scales}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. 
diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 2db2b517d52e1..7df3dd3992085 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -12,6 +12,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( context_attention_fwd) +from vllm.model_executor.layers.linear import set_weight_attrs from vllm.utils import is_hip _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] @@ -50,6 +51,11 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + + # This will be set to a float by the model initialization + # if and only if we are using it. Note that this implies we are + # supporting only scalar per-tensor scaling factors for now. + self.kv_cache_scaling_factor = None assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -65,7 +71,6 @@ def forward( value: torch.Tensor, key_cache: Optional[torch.Tensor], value_cache: Optional[torch.Tensor], - kv_cache_scaling_factor: Optional[torch.Tensor], input_metadata: InputMetadata, ) -> torch.Tensor: """PagedAttention forward pass. @@ -87,17 +92,21 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - + + # Store this here as it will be modified if we perform KV cache scaling + softmax_scale = self.scale # Reshape the keys and values and store them in the cache. # If key_cache and value_cache are not provided, the new key and value # vectors will not be cached. This happens during the initial memory # profiling run. 
if key_cache is not None and value_cache is not None: - if kv_cache_scaling_factor is not None: - # Scale the key and value scaling factors for quantization - # by cache ops - key = key.div_(kv_cache_scaling_factor) - value = value.div_(kv_cache_scaling_factor) + # Pre-scale K, V tensors; quantization done by cache_ops + # We will correct for the effects of scaling later + if self.kv_cache_scaling_factor is not None: + key.div_(self.kv_cache_scaling_factor) + value.div_(self.kv_cache_scaling_factor) + # This corrects for the K-tensor scaling. + softmax_scale *= self.kv_cache_scaling_factor cache_ops.reshape_and_cache( key, value, @@ -160,7 +169,7 @@ def forward( value, attn_bias=input_metadata.attn_bias, p=0.0, - scale=self.scale, + scale=softmax_scale, op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (is_hip()) else None, ) @@ -181,6 +190,7 @@ def forward( input_metadata.context_lens, input_metadata.max_seq_len, getattr(self, "alibi_slopes", None), + softmax_scale, ) else: @@ -191,10 +201,13 @@ def forward( value_cache, input_metadata, self.num_kv_heads, - self.scale, + softmax_scale, self.alibi_slopes, ) - + # Correct for the V tensor scaling if it took place + if key_cache is not None and value_cache is not None and \ + self.kv_cache_scaling_factor is not None: + output.mul_(self.kv_cache_scaling_factor) # Reshape the output tensor. 
return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index 8fa70054f02ca..687997c6f9338 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -617,14 +617,16 @@ def context_attention_fwd(q, b_seq_len, b_ctx_len, max_input_len, - alibi_slopes=None): + alibi_slopes=None, + sm_scale=None): BLOCK = 128 # shape constraints Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] assert Lq == Lk and Lk == Lv assert Lk in {16, 32, 64, 128} - sm_scale = 1.0 / (Lq**0.5) + if sm_scale is None: + sm_scale = 1.0 / (Lq**0.5) batch, head = b_seq_len.shape[0], q.shape[1] grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, diff --git a/vllm/model_executor/models/aquila.py b/vllm/model_executor/models/aquila.py index 4cf15dd7585b5..2f2bd5ffb4a63 100644 --- a/vllm/model_executor/models/aquila.py +++ b/vllm/model_executor/models/aquila.py @@ -45,7 +45,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.aquila import AquilaConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class AquilaMLP(nn.Module): @@ -162,9 +162,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 83bd391b108eb..f08c3c8d257ff 100644 --- a/vllm/model_executor/models/baichuan.py +++ 
b/vllm/model_executor/models/baichuan.py @@ -44,7 +44,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.baichuan import BaiChuanConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -177,9 +177,8 @@ def forward( q, k, v = qkv.chunk(chunks=3, dim=-1) if self.postion_embedding != "ALIBI": q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index f9954849bc081..4adfb6b78102f 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -122,9 +122,8 @@ def forward( del position_ids # Unused. 
qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 3fc7cafaa006f..dca8d724f976b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -28,7 +28,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GLMAttention(nn.Module): @@ -104,14 +104,13 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(position_ids, q, k) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache context_layer = self.attn( q, k, v, key_cache, value_cache, - kv_cache_scaling_factor, input_metadata, ) attn_output, _ = self.dense(context_layer) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 683168679bfa4..fc727b8e661b3 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class DeepseekMLP(nn.Module): @@ -253,9 +253,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - 
attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index e7e8271e01c5b..2b5e022312e3b 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -47,7 +47,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import RWConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] FalconConfig = Union[HF_FalconConfig, RWConfig] @@ -185,9 +185,8 @@ def forward( q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.use_rotary: q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) attn_output, bias = self.dense(attn_output) return attn_output, bias diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 6fcb13f177113..661da0fe0434e 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPT2Attention(nn.Module): @@ -85,9 +85,8 @@ def forward( ) -> torch.Tensor: qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, - kv_cache_scaling_factor, input_metadata) attn_output, _ = 
self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 79993d938f571..ef4c1d4143c88 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPTBigCodeAttention(nn.Module): @@ -104,9 +104,8 @@ def forward( ], dim=-1, ) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, - kv_cache_scaling_factor, input_metadata) attn_output, _ = self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 642a53562662a..5bab30d9d442e 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPTJAttention(nn.Module): @@ -98,9 +98,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) attn_output, _ = self.out_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 1ffb4aa6fc8df..8f7e1063e0c1d 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) 
from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPTNeoXAttention(nn.Module): @@ -99,9 +99,8 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index 0ba01243e4eac..5d0b93793c89d 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -24,7 +24,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class InternLMMLP(nn.Module): @@ -114,9 +114,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ac19238992d4d..ebf1d8a89a022 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -24,7 +24,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class 
InternLM2MLP(nn.Module): @@ -129,9 +129,8 @@ def forward( qkv, _ = self.wqkv(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.wo(attn_output) return output diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index acdd30d3d75b4..3793a148d9f68 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -43,11 +43,12 @@ get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) + hf_model_weights_iterator, + kv_cache_scales_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class LlamaMLP(nn.Module): @@ -152,9 +153,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output @@ -358,3 +358,9 @@ def load_weights(self, weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + # Should not be called unless the KV cache dtype is FP8 + def load_kv_cache_scales(self, filename: str) -> None: + for layer_idx, scaling_factor in 
kv_cache_scales_iterator(filename): + layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + layer_paged_attn.kv_cache_scaling_factor = scaling_factor diff --git a/vllm/model_executor/models/mistral.py b/vllm/model_executor/models/mistral.py index 29dfe0e0e0903..01cde67844122 100644 --- a/vllm/model_executor/models/mistral.py +++ b/vllm/model_executor/models/mistral.py @@ -47,7 +47,7 @@ from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class MistralMLP(nn.Module): @@ -150,9 +150,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index fd6e260177f7e..a8e470395b904 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class MixtralMoE(nn.Module): @@ -222,9 +222,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = 
self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index b72bfe869be14..a8dadce24aa1d 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class MixtralMLP(nn.Module): @@ -232,9 +232,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 28bdc3301c67f..22a876e2ef691 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -24,7 +24,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.mpt import MPTConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] def _get_alibi_slopes( @@ -126,9 +126,8 @@ def forward( if self.qk_ln: q = self.q_ln(q) k = self.k_ln(k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 0caa424fb2a36..393b2dcabcd5a 100644 --- 
a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class OPTLearnedPositionalEmbedding(nn.Embedding): @@ -101,9 +101,9 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, - kv_cache_scaling_factor, input_metadata) + input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index b6591f51958cb..d143261968288 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -59,7 +59,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class PhiAttention(nn.Module): @@ -120,9 +120,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b370125b64796..fbc7320fb45a4 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -29,7 +29,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.qwen import QWenConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, 
Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class QWenMLP(nn.Module): @@ -116,9 +116,8 @@ def forward( qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.c_proj(attn_output) return output diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9bbfd4864e743..e823e6f8c3dbe 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -47,7 +47,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class Qwen2MLP(nn.Module): @@ -151,9 +151,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 8112c610f23f8..95e5ad8ede63e 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class StablelmMLP(nn.Module): @@ -136,9 +136,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = 
qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/yi.py b/vllm/model_executor/models/yi.py index 1dfe566097ed0..53daa6c4cd939 100644 --- a/vllm/model_executor/models/yi.py +++ b/vllm/model_executor/models/yi.py @@ -46,7 +46,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class YiMLP(nn.Module): @@ -150,9 +150,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 0583fd3d930ef..b753c74f1a57e 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -265,12 +265,11 @@ def hf_model_weights_iterator( def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor]]: """ A simple utility to read in KV cache scaling factors that have been - previously serialized to disk. Used by the CacheEngine to populate its - caches with the appropriate scaling factors. The first object of the pair - is the cache (and model) layer corresponding to the scaling factor, and the - second is the scaling factor itself. 
Keep this function in sync with the output - of 3rdparty/quantization/extract_scales.py and with the scaling factor structure - assumed to hold in worker/cache_engine.py + previously serialized to disk. Used by the model to populate the appropriate + KV cache scaling factors. The first object of the pair is the cache (and model) + layer corresponding to the scaling factor, and the second is the scaling factor + itself. Keep this function in sync with the output of + 3rdparty/quantization/extract_scales.py """ with open(filename) as f: layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 9b177c8ef3671..5e5eb0d521ef7 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,17 +1,16 @@ """CacheEngine class for managing the KV cache.""" -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Tuple import torch from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.model_executor.weight_utils import kv_cache_scales_iterator from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class CacheEngine: @@ -44,20 +43,11 @@ def __init__( self.dtype = model_config.dtype else: self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - # We enable cache scaling factors if and only if cache is FP8-typed - self.use_scaling_factor = torch.tensor([], dtype=self.dtype).element_size() == 1 # Initialize the cache. self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() - # Load scaling factors into the GPU cache if values are specified - # We do not need to load them into the CPU cache because they are - # never swapped out. 
- if self.cache_config.kv_cache_scales is not None: - self.load_kv_cache_scales(self.cache_config.kv_cache_scales) - # Initialize the stream for caching operations. self.cache_stream = torch.cuda.Stream() assert self.cache_stream != torch.cuda.current_stream() @@ -96,15 +86,7 @@ def allocate_gpu_cache(self) -> List[KVCache]: dtype=self.dtype, device="cuda", ) - if self.use_scaling_factor: - scaling_factor = torch.ones( - 1, - dtype=torch.float32, - device='cuda', - ) - else: - scaling_factor = None - gpu_cache.append((key_blocks, value_blocks, scaling_factor)) + gpu_cache.append((key_blocks, value_blocks)) return gpu_cache def allocate_cpu_cache(self) -> List[KVCache]: @@ -128,9 +110,7 @@ def allocate_cpu_cache(self) -> List[KVCache]: dtype=self.dtype, pin_memory=pin_memory, ) - # Scale factors are not involved in the swap process and never need to reside on CPU - scaling_factor = None - cpu_cache.append((key_blocks, value_blocks, scaling_factor)) + cpu_cache.append((key_blocks, value_blocks)) return cpu_cache def _swap( @@ -141,13 +121,8 @@ def _swap( ) -> None: with torch.cuda.stream(self.cache_stream): for i in range(self.num_layers): - src_key_cache, src_value_cache, src_scaling = src[i] - dst_key_cache, dst_value_cache, dst_scaling = dst[i] - # We should not need to copy scaling factors, as they are equal - # given a fixed layer - # TODO(mattwong) Remove this once confirmed - if self.use_scaling_factor: - assert torch.equal(src_scaling, dst_scaling) + src_key_cache, src_value_cache = src[i] + dst_key_cache, dst_value_cache = dst[i] # Copy the key blocks. cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) # Copy the value blocks. @@ -167,14 +142,6 @@ def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: value_caches = [value_cache for _, value_cache, _ in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. 
cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) - - # Helper function to load in static KV cache scaling factors (one per layer) - # stored in a given file. These scaling factors are assumed to not take up - # too much space and are hence permanently resident on GPU. - def load_kv_cache_scales(self, filename: str): - for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): - self.gpu_cache[layer_idx][2].copy_(scaling_factor) - @staticmethod def get_cache_block_size( @@ -195,11 +162,6 @@ def get_cache_block_size( else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] dtype_size = _get_dtype_size(dtype) - - use_scaling_factor = dtype_size == 1 - if use_scaling_factor: - return dtype_size * total + num_layers * 4 - return dtype_size * total diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 3d63ab8613a79..551c25be320ca 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -20,7 +20,7 @@ logger = init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] _PAD_SLOT_ID = -1 LORA_WARMUP_RANK = 8 # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. @@ -83,6 +83,24 @@ def load_model(self) -> None: self.scheduler_config.max_paddings, vocab_size, self.lora_config, self.device) self.model = self.lora_manager.create_lora_manager(self.model) + + if self.model_config.kv_cache_scales is not None: + if self.kv_cache_dtype == "fp8": + if callable(getattr(self.model, "load_kv_cache_scales", None)): + self.model.load_kv_cache_scales(self.model_config.kv_cache_scales) + else: + logger.warn("Using FP8 KV cache and scaling factors provided but " + f"model {self.model.__class__} does not support " + "loading scaling factors. 
Defaulting to 1.0 scales, " + "be warned that this might lead to inaccurate " + "results!") + else: + logger.warn("User provided KV cache scaling factors but these will " + "not be used as the KV cache dtype is not FP8!") + elif self.kv_cache_dtype == "fp8": + logger.warn(f"Using FP8 KV cache but no scaling factors provided. " + "Defaulting to 1.0 scales, be warned that this might " + "lead to inaccurate results!") def set_block_size(self, block_size: int) -> None: self.block_size = block_size @@ -595,7 +613,7 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [(None, None, None)] * num_layers + kv_caches = [(None, None)] * num_layers self.execute_model(seqs, kv_caches) torch.cuda.synchronize() return From 0a80226f60690323c8a29e7d93ff9350c6eadcfc Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Mon, 19 Feb 2024 16:15:09 -0800 Subject: [PATCH 032/159] Completing KV cache scaling factors ingest (TP>1 todo), clean up code; Isolating math works --- benchmarks/benchmark_latency.py | 10 ++++++++ benchmarks/benchmark_throughput.py | 22 ++++++++--------- vllm/config.py | 8 +++---- vllm/engine/arg_utils.py | 16 ++++++------- vllm/engine/llm_engine.py | 2 +- vllm/model_executor/layers/attention.py | 24 ++++--------------- .../layers/triton_kernel/prefix_prefill.py | 6 ++--- vllm/model_executor/models/llama.py | 2 +- vllm/worker/cache_engine.py | 4 ++-- vllm/worker/model_runner.py | 20 +++++++--------- 10 files changed, 53 insertions(+), 61 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 1a910d9775fa2..d2f520c39a91b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,6 +25,7 @@ def main(args: argparse.Namespace): dtype=args.dtype, enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, + kv_cache_scales_path=args.kv_cache_scales_path, ) sampling_params = SamplingParams( @@ 
-126,6 +127,15 @@ def run_to_completion(profile_dir: Optional[str] = None): help='Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + parser.add_argument( + '--kv-cache-scales-path', + type=str, + default=None, + help='Path to the JSON files containing the KV cache scaling factors. ' + 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' + 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' + 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 28a237e962a1a..902e2f4001c47 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -72,7 +72,7 @@ def run_vllm( max_model_len: Optional[int], enforce_eager: bool, kv_cache_dtype: str, - kv_cache_scales: Optional[str], + kv_cache_scales_path: Optional[str], ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -86,7 +86,7 @@ def run_vllm( max_model_len=max_model_len, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, - kv_cache_scales=kv_cache_scales, + kv_cache_scales_path=kv_cache_scales_path, ) # Add the requests to the engine. 
@@ -212,7 +212,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.kv_cache_scales) + args.kv_cache_dtype, args.kv_cache_scales_path) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -299,14 +299,14 @@ def main(args: argparse.Namespace): 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( - '--kv-cache-scales', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied when KV cache dtype is FP8. Otherwise ' - 'the KV cache scaling factors default to 1.0, which will likely cause ' - 'accuracy issues. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') + '--kv-cache-scales-path', + type=str, + default=None, + help='Path to the JSON files containing the KV cache scaling factors. ' + 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' + 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' + 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/config.py b/vllm/config.py index 25ad9743373bd..bb0308d9add16 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,10 +27,10 @@ class ModelConfig: downloading the model and tokenizer. download_dir: Directory to download and load the weights, default to the default cache directory of huggingface. 
- kv_cache_scales: Path to file containing a JSON serialization of a map + kv_cache_scales_path: Path to files containing JSON serialization of a map of layer indices to their respective KV cache scaling factors. Used to load aforementioned scaling factors into the model when KV cache type - is FP8. + is FP8_E4M3 on ROCm (AMD GPU). load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is @@ -70,7 +70,7 @@ def __init__( tokenizer_mode: str, trust_remote_code: bool, download_dir: Optional[str], - kv_cache_scales: Optional[str], + kv_cache_scales_path: Optional[str], load_format: str, dtype: Union[str, torch.dtype], seed: int, @@ -86,7 +86,7 @@ def __init__( self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code self.download_dir = download_dir - self.kv_cache_scales = kv_cache_scales + self.kv_cache_scales_path = kv_cache_scales_path self.load_format = load_format self.seed = seed self.revision = revision diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b070029579a92..b03c39858fb22 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,7 @@ class EngineArgs: load_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' - kv_cache_scales: str = None + kv_cache_scales_path: str = None seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False @@ -133,14 +133,14 @@ def add_cli_args( 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( - '--kv-cache-scales', + '--kv-cache-scales-path', type=str, default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied when KV cache dtype is FP8. 
Otherwise ' - 'the KV cache scaling factors default to 1.0, which will likely cause ' - 'accuracy issues. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') + help='Path to the JSON files containing the KV cache scaling factors. ' + 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' + 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' + 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument('--max-model-len', type=int, default=None, @@ -281,7 +281,7 @@ def create_engine_configs( Optional[LoRAConfig]]: model_config = ModelConfig(self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, - self.download_dir, self.kv_cache_scales, self.load_format, + self.download_dir, self.kv_cache_scales_path, self.load_format, self.dtype, self.seed, self.revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index bee73b56e0a40..0019e9b94b3a2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -85,7 +85,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " - f"kv_cache_scales={model_config.kv_cache_scales}, " + f"kv_cache_scales_path={model_config.kv_cache_scales_path}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. 
diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 7df3dd3992085..5fc69f3593114 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -52,9 +52,9 @@ def __init__( alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) - # This will be set to a float by the model initialization - # if and only if we are using it. Note that this implies we are - # supporting only scalar per-tensor scaling factors for now. + # This will be set to a float by model initialization per attention, + # if and only if we are using it. N.B. currently we only support per + # tensor scalar scaling factors & only applicable to ROCm (AMD GPU). self.kv_cache_scaling_factor = None assert self.num_heads % self.num_kv_heads == 0 @@ -93,20 +93,11 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - # Store this here as it will be modified if we perform KV cache scaling - softmax_scale = self.scale # Reshape the keys and values and store them in the cache. # If key_cache and value_cache are not provided, the new key and value # vectors will not be cached. This happens during the initial memory # profiling run. if key_cache is not None and value_cache is not None: - # Pre-scale K, V tensors; quantization done by cache_ops - # We will correct for the effects of scaling later - if self.kv_cache_scaling_factor is not None: - key.div_(self.kv_cache_scaling_factor) - value.div_(self.kv_cache_scaling_factor) - # This corrects for the K-tensor scaling. 
- softmax_scale *= self.kv_cache_scaling_factor cache_ops.reshape_and_cache( key, value, @@ -169,7 +160,7 @@ def forward( value, attn_bias=input_metadata.attn_bias, p=0.0, - scale=softmax_scale, + scale=self.scale, op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (is_hip()) else None, ) @@ -190,7 +181,6 @@ def forward( input_metadata.context_lens, input_metadata.max_seq_len, getattr(self, "alibi_slopes", None), - softmax_scale, ) else: @@ -201,13 +191,9 @@ def forward( value_cache, input_metadata, self.num_kv_heads, - softmax_scale, + self.scale, self.alibi_slopes, ) - # Correct for the V tensor scaling if it took place - if key_cache is not None and value_cache is not None and \ - self.kv_cache_scaling_factor is not None: - output.mul_(self.kv_cache_scaling_factor) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index 687997c6f9338..8fa70054f02ca 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -617,16 +617,14 @@ def context_attention_fwd(q, b_seq_len, b_ctx_len, max_input_len, - alibi_slopes=None, - sm_scale=None): + alibi_slopes=None): BLOCK = 128 # shape constraints Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1] assert Lq == Lk and Lk == Lv assert Lk in {16, 32, 64, 128} - if sm_scale is None: - sm_scale = 1.0 / (Lq**0.5) + sm_scale = 1.0 / (Lq**0.5) batch, head = b_seq_len.shape[0], q.shape[1] grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) # batch, head, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 3793a148d9f68..cd10653b8c1ff 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -359,7 +359,7 @@ def load_weights(self, default_weight_loader) weight_loader(param, loaded_weight) - # Should not be 
called unless the KV cache dtype is FP8 + # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) def load_kv_cache_scales(self, filename: str) -> None: for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 5e5eb0d521ef7..f57e1ed75803d 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -138,8 +138,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: - key_caches = [key_cache for key_cache, _, _ in self.gpu_cache] - value_caches = [value_cache for _, value_cache, _ in self.gpu_cache] + key_caches = [key_cache for key_cache, _ in self.gpu_cache] + value_caches = [value_cache for _, value_cache in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 551c25be320ca..c4a48183ccc55 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -84,23 +84,21 @@ def load_model(self) -> None: self.lora_config, self.device) self.model = self.lora_manager.create_lora_manager(self.model) - if self.model_config.kv_cache_scales is not None: + if self.model_config.kv_cache_scales_path is not None: if self.kv_cache_dtype == "fp8": if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales(self.model_config.kv_cache_scales) + self.model.load_kv_cache_scales(self.model_config.kv_cache_scales_path) else: logger.warn("Using FP8 KV cache and scaling factors provided but " - f"model {self.model.__class__} does not support " - "loading scaling factors. 
Defaulting to 1.0 scales, " - "be warned that this might lead to inaccurate " - "results!") + f"model {self.model.__class__} does not support loading " + "scaling factors. Defaulting to scaling factors of 1.0, " + "This may lead to less accurate results!") else: - logger.warn("User provided KV cache scaling factors but these will " - "not be used as the KV cache dtype is not FP8!") + logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. " + "KV cache scaling factors will not be used.") elif self.kv_cache_dtype == "fp8": - logger.warn(f"Using FP8 KV cache but no scaling factors provided. " - "Defaulting to 1.0 scales, be warned that this might " - "lead to inaccurate results!") + logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " + "scaling factors of 1.0, This may lead to less accurate results!") def set_block_size(self, block_size: int) -> None: self.block_size = block_size From 7b26ec9ca03c09ca414e3c5f7ac3770dfa35a942 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 17:02:35 +0000 Subject: [PATCH 033/159] Fix typos, add a few more sanity checks to the KV cache scales loader, remove PT support from scales extraction utility --- 3rdparty/quantizer/extract_scales.py | 5 +++-- benchmarks/benchmark_latency.py | 2 +- benchmarks/benchmark_throughput.py | 5 ++--- vllm/engine/arg_utils.py | 2 +- vllm/model_executor/weight_utils.py | 4 ++++ 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index ace05e52a69b8..ca1d41dcd40cb 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -25,7 +25,8 @@ def main(args): for name, param in hf_model_weights_iterator(args.model, args.cache_dir, args.load_format, - args.revision): + args.revision, + fall_back_to_pt=False): if "kv_cache_scaling_factor" in name: nums = [int(s) for s in name.split('.') if s.isdigit()] assert 
len(nums) == 1, f"Could not determine layer idx for {name}!" @@ -67,7 +68,7 @@ def main(args): parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " "containing the KV cache scaling factors.", - choices=["auto", "safetensors", "pt", "npcache"], + choices=["auto", "safetensors", "npcache"], default="auto") parser.add_argument("--revision", help="Optionally specify the model's revision number.", diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d2f520c39a91b..c365f08b021ad 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -134,7 +134,7 @@ def run_to_completion(profile_dir: Optional[str] = None): help='Path to the JSON files containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( '--profile', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 902e2f4001c47..ae37e356cda9a 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -108,9 +108,8 @@ def run_vllm( start = time.perf_counter() # FIXME(woosuk): Do not use internal method. - outputs = llm._run_engine(use_tqdm=True) + llm._run_engine(use_tqdm=True) end = time.perf_counter() - print(outputs[-1]) return end - start @@ -305,7 +304,7 @@ def main(args: argparse.Namespace): help='Path to the JSON files containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. 
' - 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') args = parser.parse_args() if args.tokenizer is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b03c39858fb22..df75405269662 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -139,7 +139,7 @@ def add_cli_args( help='Path to the JSON files containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument('--max-model-len', type=int, diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index b753c74f1a57e..d5c77de5b361d 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -273,6 +273,10 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor] """ with open(filename) as f: layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) + if not isinstance(layer_scale_factor_map, dict) or \ + len(layer_scale_factor_map) == 0: + raise RuntimeError(f"File '{filename}' does not specify a valid " + "layer:scale_factor map.") for layer_idx, scale_factor in layer_scale_factor_map.items(): yield int(layer_idx), float(scale_factor) From 936821e9175e96188c75467f81839c361ee47d9c Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 19:21:27 +0000 Subject: [PATCH 034/159] Add additional checks to the scaling factor loader and fail gracefully on errors --- 
vllm/model_executor/weight_utils.py | 38 +++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index d5c77de5b361d..5b41d94f178a7 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -271,14 +271,36 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor] itself. Keep this function in sync with the output of 3rdparty/quantization/extract_scales.py """ - with open(filename) as f: - layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) - if not isinstance(layer_scale_factor_map, dict) or \ - len(layer_scale_factor_map) == 0: - raise RuntimeError(f"File '{filename}' does not specify a valid " - "layer:scale_factor map.") - for layer_idx, scale_factor in layer_scale_factor_map.items(): - yield int(layer_idx), float(scale_factor) + try: + with open(filename) as f: + # For now we do not obtain any of the benefits of iterators + # but since the number of layers = number of scales is typically + # small, this is not a concern. 
Loading and processing the entire + # dictionary at once allows us to do sanity checks all at once and + # avoid a situation where we have to abort after having partially + # loaded scaling factors + raw_map = json.load(f, parse_int=int, parse_constant=float) + if not isinstance(raw_map, dict) or len(raw_map) == 0: + raise RuntimeError(f"File '{filename}' does not specify a valid " + "layer:scale_factor map.") + # If any of the inputs are malformed, it will raise an error here and + # be caught in except + layer_scales_map = {int(layer_idx): float(scale) + for layer_idx, scale in raw_map.items()} + return layer_scales_map.items() + + except FileNotFoundError: + logger.error(f"File '{filename}' not found.") + except json.JSONDecodeError: + logger.error(f"Error decoding JSON in file '{filename}'.") + except Exception as e: + logger.error(f"An error occurred while reading file '{filename}': {e}") + # This section is only reached if any of the excepts are hit + # Return an empty iterator (tuple) => no KV cache scales are loaded + # which effectively defaults to 1.0 scales + logger.warn("Defaulting to KV cache scaling factors = 1.0 as an error " + "occurred while trying to load them from file.") + return () def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: From 74b2b3fb05759513863eae2489d29a7c7cc3b312 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 19:33:41 +0000 Subject: [PATCH 035/159] Remove lingering PT fallback in extraction utility --- 3rdparty/quantizer/extract_scales.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index ca1d41dcd40cb..63104bfb32be9 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -15,7 +15,8 @@ def main(args): hf_folder, _, _ = prepare_hf_model_weights(args.model, args.cache_dir, args.load_format, - revision=args.revision) + revision=args.revision, + fall_back_to_pt=False) 
output_file = os.path.join(hf_folder, default_output_name) else: output_file = os.path.join(args.output, default_output_name) From 2c7ce967cfdbc4d68a487e5d203e05a963c59b57 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 19:44:04 +0000 Subject: [PATCH 036/159] Add ROCm clarification to extract scales script --- 3rdparty/quantizer/extract_scales.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 63104bfb32be9..a94c4d7d379c9 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -55,8 +55,8 @@ def main(args): "and saves them to a JSON file compatible with later " "use by vLLM (pass this file to the appropriate " "runtime typically using the argument " - "--kv-cache-scales ). This is only used " - "if the KV cache dtype is FP8.") + "--kv-cache-scales-path ). This is only used " + "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--model", help="Specify either a directory or name of a HF model. 
If the model " "does not exist, this utility will attempt to download said model " From 763b2837453de1aa1ca19812fc492f40df0d0a68 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 21 Feb 2024 21:12:59 +0000 Subject: [PATCH 037/159] Preliminary TP rank > 1 extraction and loading support --- 3rdparty/quantizer/extract_scales.py | 232 ++++++++++++++++++++++----- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/weight_utils.py | 44 +++-- 3 files changed, 224 insertions(+), 59 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index a94c4d7d379c9..3c54222bb41d2 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -1,52 +1,195 @@ import argparse +import fnmatch +import glob +from huggingface_hub import snapshot_download, HfFileSystem import json +import numpy as np import os -from vllm.model_executor.weight_utils import ( - hf_model_weights_iterator, - prepare_hf_model_weights -) +from safetensors.torch import safe_open +import torch +from typing import List, Optional, Tuple -default_output_name = "kv_cache_scales.json" + +# Adapted from vllm/model_executor/weight_utils.py +# The main differences are that we add the NPZ format and that there's no +# need for a file lock when downloading model weights because this tool is +# not intended to be run on multiple processes simultaneously. +# Since our use case is sufficiently different, we define our own function +# here. +def _prepare_hf_weights( + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + fall_back_to_pt: bool = True, + revision: Optional[str] = None, +) -> Tuple[str, List[str], bool]: + # Download model weights from huggingface. + is_local = os.path.isdir(model_name_or_path) + use_safetensors = False + # Some quantized models use .pt files for storing the weights. 
+ if load_format == "auto": + allow_patterns = ["*.safetensors", "*.bin"] + elif load_format == "safetensors": + use_safetensors = True + allow_patterns = ["*.safetensors"] + elif load_format == "pt": + allow_patterns = ["*.pt"] + elif load_format == "npz": + allow_patterns = ["*.npz"] + else: + raise ValueError(f"Unknown load_format: {load_format}") + + if fall_back_to_pt: + allow_patterns += ["*.pt"] + + if not is_local: + # Before we download we look at that is available: + fs = HfFileSystem() + file_list = fs.ls(model_name_or_path, detail=False, revision=revision) + # depending on what is available we download different things + for pattern in allow_patterns: + matching = fnmatch.filter(file_list, pattern) + if len(matching) > 0: + allow_patterns = [pattern] + break + print(f"Downloading model... Using model weights format {allow_patterns}") + hf_folder = snapshot_download(model_name_or_path, + allow_patterns=allow_patterns, + cache_dir=cache_dir, + revision=revision) + else: + hf_folder = model_name_or_path + hf_weights_files: List[str] = [] + for pattern in allow_patterns: + hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) + if len(hf_weights_files) > 0: + if pattern == "*.safetensors": + use_safetensors = True + break + if not use_safetensors: + # Exclude files that are not needed for inference. 
+ # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 + blacklist = [ + "training_args.bin", + "optimizer.bin", + "optimizer.pt", + "scheduler.pt", + "scaler.pt", + ] + hf_weights_files = [ + f for f in hf_weights_files + if not any(f.endswith(x) for x in blacklist) + ] + + if len(hf_weights_files) == 0: + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") + + return hf_folder, hf_weights_files, use_safetensors +# Adapted from vllm/model_executor/weight_utils.py +def _hf_tensorfile_iterator(filename: str, load_format: str, + use_safetensors: bool): + if load_format == "npz": + assert not use_safetensors + with np.load(filename) as data: + for name in data.files: + param = torch.from_numpy(data[name]) + yield name, param + elif use_safetensors: + with safe_open(filename, framework="pt") as f: + for name in f.keys(): + param = f.get_tensor(name) + yield name, param + else: + state = torch.load(filename, map_location="cpu") + for name, param in state.items(): + yield name, param + del state + torch.cuda.empty_cache() + + +# Used by both main and if __name__ == "__main__" +_default_kvcache_scales_filename = "kv_cache_scales.json" + def main(args): - layer_scale_factors_map = {} - if args.output is None: - hf_folder, _, _ = prepare_hf_model_weights(args.model, - args.cache_dir, - args.load_format, - revision=args.revision, - fall_back_to_pt=False) - output_file = os.path.join(hf_folder, default_output_name) + rank_tensors_map = {} + hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( + args.model, + args.cache_dir, + args.load_format, + revision=args.revision, + fall_back_to_pt=True) + # Matches the number immediately after this keyword in the tensor filename to + # determine the TP rank corresponding to said tensor file + rank_keyword = "rank" + for tensor_file in hf_tensor_files: + try: + rank_idx = tensor_file.find(rank_keyword) + if rank_idx != -1: + start_idx = rank_idx + 
len(rank_keyword) + stop_idx = start_idx + while stop_idx < len(tensor_file) and tensor_file[stop_idx].isdecimal(): + stop_idx += 1 + if stop_idx == start_idx: + raise RuntimeError("Did not find rank # in filename.") + rank = int(tensor_file[start_idx:stop_idx]) + elif len(hf_tensor_files) == 1: + # Since there is only one tensor file, we can assume + # that it's intended for TP rank 0 + rank = 0 + else: + raise RuntimeError(f"Filename does not contain '{rank_keyword}'.") + except RuntimeError: + print("Unable to determine TP rank " + f"corresponding to file '{tensor_file}'") + raise + + if rank not in rank_tensors_map: + layer_scales_map = {} + rank_tensors_map[rank] = layer_scales_map + else: + raise RuntimeError(f"Tensor file '{tensor_file}' shares TP rank {rank} " + "with another tensor file.") + + module_delimiter = ":" if args.load_format == "npz" else "." + for name, param in _hf_tensorfile_iterator(tensor_file, args.load_format, + use_safetensors): + if "kv_cache_scaling_factor" in name: + nums = [int(s) for s in name.split(module_delimiter) if s.isdigit()] + assert len(nums) == 1, f"Could not determine layer idx for {name}" + layer_idx = nums[0] + assert layer_idx not in layer_scales_map, f"Duplicate scaling " \ + f"factor corresponding to layer {layer_idx}" + try: + layer_scales_map[layer_idx] = param.item() + except RuntimeError: + print("This utility supports only per-tensor scalar scale factors " + f"for now. 
The tensor\n {name} = {param} is an invalid " + "scale factor.") + raise + + if args.output_path is None: + output_file = os.path.join(hf_folder, _default_kvcache_scales_filename) else: - output_file = os.path.join(args.output, default_output_name) - if not os.path.isdir(args.output): - os.makedirs(args.output, exist_ok=True) - - for name, param in hf_model_weights_iterator(args.model, - args.cache_dir, - args.load_format, - args.revision, - fall_back_to_pt=False): - if "kv_cache_scaling_factor" in name: - nums = [int(s) for s in name.split('.') if s.isdigit()] - assert len(nums) == 1, f"Could not determine layer idx for {name}!" - layer_idx = nums[0] - assert layer_idx not in layer_scale_factors_map, f"Duplicate scaling " \ - f"factor corresponding to layer {layer_idx}!" - try: - layer_scale_factors_map[layer_idx] = param.item() - except RuntimeError: - print("This utility supports only per-tensor scalar scale factors " - f"for now. The tensor\n {name} = {param} is an invalid " - "scale factor!") - raise - if len(layer_scale_factors_map) == 0: - print("WARNING: No KV cache scale factors found! No output saved.") + output_file = os.path.join(args.output_path, _default_kvcache_scales_filename) + if not os.path.isdir(args.output_path): + os.makedirs(args.output_path, exist_ok=True) + + if (len(rank_tensors_map) == 0 or + all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values())): + print("WARNING: No KV cache scale factors found. No output saved.") else: + tp_world_size = max(rank_tensors_map.keys()) + 1 + for i in range(tp_world_size): + assert i in rank_tensors_map, f"Expected TP world size = {tp_world_size} " \ + "but did not find KV cache scaling factors " \ + f"for TP rank {i}" with open(output_file, 'w') as f: - json.dump(layer_scale_factors_map, f, sort_keys=True) - print(f"Completed! KV cache scaling factors saved to {output_file}") + json.dump(rank_tensors_map, f, sort_keys=True, indent=4) + print(f"Completed! 
Found TP world size = {tp_world_size}.", + f"KV cache scaling factors saved to {output_file}") if __name__ == "__main__": @@ -69,15 +212,16 @@ def main(args): parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " "containing the KV cache scaling factors.", - choices=["auto", "safetensors", "npcache"], + choices=["auto", "safetensors", "npz", "pt"], default="auto") parser.add_argument("--revision", help="Optionally specify the model's revision number.", default=None) - parser.add_argument("--output", - help="Specify the output directory. By default it will be saved in " - f"the model directory with the filename {default_output_name}, " - "however you can override this behavior here.", + parser.add_argument("--output_path", + help="Optionally specify the output directory. By default the " + "scaling factors will be saved in the model directory with the " + f"filename {_default_kvcache_scales_filename}, however you can " + "override this behavior here.", default=None) args = parser.parse_args() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index cd10653b8c1ff..eafb6d247342d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -40,6 +40,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, @@ -361,6 +362,10 @@ def load_weights(self, # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) def load_kv_cache_scales(self, filename: str) -> None: - for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): + tp_size = get_tensor_model_parallel_world_size() + tp_rank = 
get_tensor_model_parallel_rank() + for layer_idx, scaling_factor in kv_cache_scales_iterator( + filename, tp_rank, tp_size, + self.model.config.num_hidden_layers): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn layer_paged_attn.kv_cache_scaling_factor = scaling_factor diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 5b41d94f178a7..3d8a09d9b9035 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -262,14 +262,18 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor]]: +def kv_cache_scales_iterator(filename: str, + tp_rank: int, + tp_size: int, + num_hidden_layers: int) -> Iterator[Tuple[int, torch.Tensor]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. Used by the model to populate the appropriate - KV cache scaling factors. The first object of the pair is the cache (and model) - layer corresponding to the scaling factor, and the second is the scaling factor - itself. Keep this function in sync with the output of - 3rdparty/quantization/extract_scales.py + KV cache scaling factors. The serialization should represent a dictionary + whose keys are the TP ranks and values are another dictionary mapping layers + to their KV cache scaling factors. 
+ Keep this function in sync with the output + of 3rdparty/quantization/extract_scales.py """ try: with open(filename) as f: @@ -279,14 +283,26 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor] # dictionary at once allows us to do sanity checks all at once and # avoid a situation where we have to abort after having partially # loaded scaling factors - raw_map = json.load(f, parse_int=int, parse_constant=float) - if not isinstance(raw_map, dict) or len(raw_map) == 0: - raise RuntimeError(f"File '{filename}' does not specify a valid " - "layer:scale_factor map.") - # If any of the inputs are malformed, it will raise an error here and - # be caught in except + raw_rank_map = json.load(f, parse_int=int, parse_constant=float) + + # If any of the inputs are malformed, it raises an error somewhere + # in the following lines and is caught in except + assert isinstance(raw_rank_map, dict), "Did not load a dictionary from file." + assert len(raw_rank_map) != 0, "Loaded dictionary is empty." + for rank, scales_map in raw_rank_map.items(): + assert len(scales_map) == num_hidden_layers, "KV cache scales map for TP rank " \ + f"{rank} is malformed. Expected {num_hidden_layers} layers, got {len(scales_map)}." + for i in range(tp_size): + assert i in raw_rank_map or str(i) in raw_rank_map, "KV cache scales map for TP rank " \ + f"{i} not found." + assert tp_rank in raw_rank_map or str(tp_rank) in raw_rank_map, "Tried to load KV cache " \ + f"scales for TP rank {tp_rank} but these were not found." + raw_layer_scales_map = raw_rank_map.get(tp_rank) or raw_rank_map.get(str(tp_rank)) layer_scales_map = {int(layer_idx): float(scale) - for layer_idx, scale in raw_map.items()} + for layer_idx, scale in raw_layer_scales_map.items()} + for i in range(num_hidden_layers): + assert i in layer_scales_map, "Could not find KV cache scales for layer " \ + f"{i} in TP rank {tp_rank}." 
return layer_scales_map.items() except FileNotFoundError: @@ -298,8 +314,8 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor] # This section is only reached if any of the excepts are hit # Return an empty iterator (tuple) => no KV cache scales are loaded # which effectively defaults to 1.0 scales - logger.warn("Defaulting to KV cache scaling factors = 1.0 as an error " - "occurred while trying to load them from file.") + logger.warn(f"Defaulting to KV cache scaling factors = 1.0 for all layers in TP rank {tp_rank}" + " as an error occurred during loading.") return () From ef2671641ce85736ca2a75480d6e71030d276eab Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 21 Feb 2024 21:30:01 +0000 Subject: [PATCH 038/159] Ensure loaded dictionary has same TP size as currently running engine --- vllm/model_executor/weight_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3d8a09d9b9035..85441281eb4a9 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -289,6 +289,9 @@ def kv_cache_scales_iterator(filename: str, # in the following lines and is caught in except assert isinstance(raw_rank_map, dict), "Did not load a dictionary from file." assert len(raw_rank_map) != 0, "Loaded dictionary is empty." + loaded_tp_size = max(int(rank) for rank in raw_rank_map) + 1 + assert loaded_tp_size == tp_size, f"Loaded dictionary has TP size {loaded_tp_size} " \ + f"but LLM engine is currently running with TP size {tp_size}." for rank, scales_map in raw_rank_map.items(): assert len(scales_map) == num_hidden_layers, "KV cache scales map for TP rank " \ f"{rank} is malformed. Expected {num_hidden_layers} layers, got {len(scales_map)}." 
From 0e73aed90ef2d2c8bf7de0cd7e4aeb90a98ef3b9 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 15 Feb 2024 06:23:50 +0000 Subject: [PATCH 039/159] Fix Dockerfile errors --- Dockerfile.rocm | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 08783e0a1e0bd..36fac166ae44b 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -18,14 +18,6 @@ RUN git clone -b develop https://github.com/ROCmSoftwarePlatform/hipBLASLt \ RUN sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status RUN sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status -RUN git clone https://streamhsa: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@github.com/rocm/rocBLAS-internal.git \ - && export GTest_DIR="/usr/local/lib/cmake/GTest/" \ - && cd rocBLAS-internal \ - && git fetch origin 4f353a8035da38c8b8873823c09a499db777b231 \ - && git checkout 4f353a8035da38c8b8873823c09a499db777b231 \ - && ./install.sh -idc -a 'gfx90a;gfx942' \ - && cd ../ && rm -rf rocBLAS-internal - RUN pip uninstall -y triton RUN git clone https://github.com/ROCmSoftwarePlatform/triton.git \ && cd triton/python && pip3 install -e . 
@@ -52,5 +44,4 @@ RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ COPY docker/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6.0.60000 COPY docker/libfile_plugin.so /opt/rocm/lib/roctracer -COPY docker/run_13b.sh $WORKSPACE_DIR/ -COPY docker/run_70b.sh $WORKSPACE_DIR/ +ENV WORKSPACE_DIR=/workspace/vllm From 90df0c93877438ba427721a73bf2af1338e0da68 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Fri, 16 Feb 2024 04:48:03 +0000 Subject: [PATCH 040/159] Add llama2 run script --- run_llama2.sh | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100755 run_llama2.sh diff --git a/run_llama2.sh b/run_llama2.sh new file mode 100755 index 0000000000000..1444ca7d222a1 --- /dev/null +++ b/run_llama2.sh @@ -0,0 +1,98 @@ +#!/bin/bash +BASE_DIR=/workspace +VLLM_DIR=$BASE_DIR/vllm +GRAD_DIR=$VLLM_DIR/gradlib +RPD_DIR=/workspace/rocmProfileData +MODEL=/data/llama2-70b-chat +MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` +#MODEL=/data/llama-2-13b-chat-hf +GEMM_TUNER=1 +#TP="1 2 4 8" +TP=8 +#Flag to use Triton Flash Attention vs CK +#export VLLM_USE_TRITON=1 + +#Gemm tuner flags +export VLLM_TUNE_GEMM=0 +export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" +export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" + +#Flag to use old torch.multinomial +#export VLLM_USE_TORCH_MULTINOMIAL=1 + +#Delete tuned gemms before running. 
+#DELETE_TUNED_CSV=1 +#Flag to disable MSCCL +#export RCCL_MSCCL_ENABLE=0 +#HIPGraph performance flags +export HIP_FORCE_DEV_KERNARG=1 +export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 +#Enable full decoder graph mode +#Use top of tree build of RCCL +export LD_LIBRARY_PATH=/workspace/rccl/build/ +#Enable either flag to create a profile trace (rocprof, or rocpd) +#RPD_PROFILE="--rpd" +#ROCPROF_PROFILE="rocprof --hip-trace" +GEN_LEN="1,32,128" +INPUT_LEN="512,1024,2048,3072" + +ITER=10 +# pring usage of the parameters +usage() { + echo "Usage: $0 [--tp ] [--model ]" + exit 1 +} +# parse parameters +while [[ "$#" -gt 0 ]]; do + case $1 in + --tp) TP="$2"; shift ;; + --model) MODEL="$2"; shift ;; + --notune) GEMM_TUNER=0; shift ;; + *) usage ;; # Any other argument will show usage information. + esac + shift # Move to next argument +done +for tp in $TP; +do + if (( $GEMM_TUNER )); + then + echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml + + if [[ $DELETE_TUNED_CSV == 1 ]]; + then + rm -rf $VLLM_TUNE_FILE + fi + #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml + echo "INFO: Generating Tuned Gemm configs" + cd $GRAD_DIR + python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp + fi + + cd $VLLM_DIR + for gen_len in $GEN_LEN; + do + for input_len in $INPUT_LEN; + do + if [[ -v RPD_PROFILE ]] ; + then + rm /workspace/trace.rpd + python -m rocpd.schema --create /workspace/trace.rpd + fi + echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" + $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ + --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report + if [[ -v ROCPROF_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + mv 
$VLLM_DIR/results.json $TRACE_FILE + fi + if [[ -v RPD_PROFILE ]] ; + then + TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json + echo "INFO: Creating Trace JSON file $TRACE_FILE" + python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE + fi + done + done +done From ab672805783edd129fbafa7fc4854874e2bcef64 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Fri, 16 Feb 2024 21:31:25 +0000 Subject: [PATCH 041/159] Increase Partition and Num threads for attention blocks --- csrc/attention/attention_kernels.cu | 7 ++++--- vllm/model_executor/layers/attention.py | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 9dcacfbe47d48..728ed64eab3d6 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -37,6 +37,7 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + namespace vllm { // Utility function for attention softmax. @@ -572,7 +573,7 @@ __global__ void paged_attention_v2_reduce_kernel( template< typename T, int BLOCK_SIZE, - int NUM_THREADS = 128> + int NUM_THREADS = 1024> void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, @@ -731,8 +732,8 @@ void paged_attention_v1( template< typename T, int BLOCK_SIZE, - int NUM_THREADS = 128, - int PARTITION_SIZE = 512> + int NUM_THREADS = 1024, + int PARTITION_SIZE = 1024> void paged_attention_v2_launcher( torch::Tensor& out, torch::Tensor& exp_sums, diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 8b5c6ab30d7b7..f05b49d494958 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -16,8 +16,11 @@ _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
-_PARTITION_SIZE = 512 +if is_hip: + _PARTITION_SIZE = 1024 +else: + _PARTITION_SIZE = 512 class PagedAttention(nn.Module): """MHA/MQA/GQA layer with PagedAttention. From 1d53722476833033b4e580260944322fa119d52f Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 22 Feb 2024 02:15:22 +0000 Subject: [PATCH 042/159] Fix WORKDIR --- Dockerfile.rocm | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 36fac166ae44b..873574c409a8f 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -42,6 +42,4 @@ RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3 RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ && cd rocmProfileData && make; make install -COPY docker/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6.0.60000 -COPY docker/libfile_plugin.so /opt/rocm/lib/roctracer -ENV WORKSPACE_DIR=/workspace/vllm +WORKDIR /workspace/vllm From 5148aa59c111d60d5d2b8bdb4c59c7676221cb12 Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Thu, 22 Feb 2024 02:41:09 +0000 Subject: [PATCH 043/159] Add accuracy flag to benchmark_latency.py --- benchmarks/benchmark_latency.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e4d70851e46ef..be1dd223818fc 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -46,6 +46,8 @@ def main(args: argparse.Namespace): ) print(sampling_params) dummy_prompt_token_ids = [[0] * input_len] * batch_size + dummy_prompts = [] + dummy_prompts.append('DeepSpeed is a machine learning library that deep learning practitioners should use for what purpose') def run_to_completion(profile_dir: Optional[str] = None): if profile_dir: @@ -60,13 +62,25 @@ def run_to_completion(profile_dir: Optional[str] = None): sampling_params=sampling_params, use_tqdm=False) print(p.key_averages()) + elif args.accuracy: + start_time = time.perf_counter() + rsp = 
llm.generate( + #prompt_token_ids=dummy_prompt_token_ids, + prompts=dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + print('>>Rsp', rsp[0].outputs) + return latency else: start_time = time.perf_counter() - llm.generate(prompt_token_ids=dummy_prompt_token_ids, + rsp = llm.generate(prompt_token_ids=dummy_prompt_token_ids, sampling_params=sampling_params, use_tqdm=False) end_time = time.perf_counter() latency = end_time - start_time + print('>>Rsp', rsp[0].outputs) return latency print("Warming up...") @@ -92,7 +106,8 @@ def run_to_completion(profile_dir: Optional[str] = None): profile_rpd = rpdTracerControl() profile_rpd.start() print(f"RPD Profiling'...") - run_to_completion(profile_dir=None) + with torch.autograd.profiler.emit_nvtx(): + run_to_completion(profile_dir=None) profile_rpd.stop() return @@ -140,6 +155,7 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') + parser.add_argument( '--dtype', type=str, @@ -152,6 +168,9 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--enforce-eager', action='store_true', help='enforce eager mode and disable CUDA graph') + parser.add_argument('--accuracy', + action='store_true', + help='Run an Actual query through vllm') parser.add_argument( '--profile', action='store_true', @@ -162,6 +181,10 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. 
Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument( + '--rpd', + action='store_true', + help='profile the generation process of a single batch using the rpd tracer') parser.add_argument('--warmup-only', action='store_true', help='only run warmup, useful for tuning') parser.add_argument('--report', action='store_true', From c7e2587507d76a8f55ea744a891d554adfc2491a Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 00:48:55 +0000 Subject: [PATCH 044/159] Add tp_size argument for user to specify TP size to expect in quantized model. Renamed --model argument to --quantized_model to be explicit. --- 3rdparty/quantizer/extract_scales.py | 43 ++++++++++++++++++---------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 3c54222bb41d2..eebbd88a62d13 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -116,7 +116,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def main(args): rank_tensors_map = {} hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( - args.model, + args.quantized_model, args.cache_dir, args.load_format, revision=args.revision, @@ -181,14 +181,18 @@ def main(args): all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values())): print("WARNING: No KV cache scale factors found. No output saved.") else: - tp_world_size = max(rank_tensors_map.keys()) + 1 - for i in range(tp_world_size): - assert i in rank_tensors_map, f"Expected TP world size = {tp_world_size} " \ + empirical_tp_world_size = max(rank_tensors_map.keys()) + 1 + if args.tp_size is not None: + assert args.tp_size == empirical_tp_world_size, "User expected TP world size = " \ + f"{args.tp_size} but expecting TP world size = {empirical_tp_world_size} from " \ + "model instead." 
+ for i in range(empirical_tp_world_size): + assert i in rank_tensors_map, f"Expected TP world size = {empirical_tp_world_size} " \ "but did not find KV cache scaling factors " \ f"for TP rank {i}" with open(output_file, 'w') as f: json.dump(rank_tensors_map, f, sort_keys=True, indent=4) - print(f"Completed! Found TP world size = {tp_world_size}.", + print(f"Completed! Found TP world size = {empirical_tp_world_size}.", f"KV cache scaling factors saved to {output_file}") @@ -198,16 +202,14 @@ def main(args): "and saves them to a JSON file compatible with later " "use by vLLM (pass this file to the appropriate " "runtime typically using the argument " - "--kv-cache-scales-path ). This is only used " + "--kv_cache_scales_path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") - parser.add_argument("--model", - help="Specify either a directory or name of a HF model. If the model " - "does not exist, this utility will attempt to download said model " - "from the HF repo.", + parser.add_argument("--quantized_model", + help="Specify either the local path to, or name of, a quantized HF model.", required=True) parser.add_argument("--cache_dir", - help="Optionally specify a cache directory to use for a HF model " - "download.", + help="Optionally specify a cache directory to use in the event of a HF " + "model download.", default=None) parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " @@ -219,10 +221,21 @@ def main(args): default=None) parser.add_argument("--output_path", help="Optionally specify the output directory. 
By default the " - "scaling factors will be saved in the model directory with the " - f"filename {_default_kvcache_scales_filename}, however you can " - "override this behavior here.", + "KV cache scaling factors will be saved in the model directory " + f"with the filename {_default_kvcache_scales_filename}, however " + "you can override this behavior here.", default=None) + parser.add_argument("--tp_size", + help="Optionally specify the tensor-parallel (TP) size that the " + "quantized model should correspond to. If specified, during KV " + "cache scaling factor extraction the observed TP size will be " + "checked against this and an error will be raised if there is " + "a mismatch. If not specified, the quantized model's expected " + "TP size is instead inferred from the largest TP rank observed. " + "The expected TP size is cross-checked against the TP ranks " + "observed in the quantized model and an error is raised if any " + "discrepancies are found.", + default=None, type=int) args = parser.parse_args() main(args) From 61f20461a577e945e05d8b2e083be4b963e53304 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 00:54:44 +0000 Subject: [PATCH 045/159] Add specific FP8 E4M3 and ROCm flavor text to the --quantized_model argument --- 3rdparty/quantizer/extract_scales.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index eebbd88a62d13..7a26cff7d93af 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -205,7 +205,9 @@ def main(args): "--kv_cache_scales_path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--quantized_model", - help="Specify either the local path to, or name of, a quantized HF model.", + help="Specify either the local path to, or name of, a quantized HF model. 
" + "It is expected that the quantization format is FP8_E4M3, for use on ROCm " + "(AMD GPU).", required=True) parser.add_argument("--cache_dir", help="Optionally specify a cache directory to use in the event of a HF " From dc71088e9607b7bc13bd09cf484062b0dde2b10d Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 01:00:44 +0000 Subject: [PATCH 046/159] Small tweak on expected TP size flavor text for clarity --- 3rdparty/quantizer/extract_scales.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 7a26cff7d93af..84a28c33a7d24 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -184,8 +184,8 @@ def main(args): empirical_tp_world_size = max(rank_tensors_map.keys()) + 1 if args.tp_size is not None: assert args.tp_size == empirical_tp_world_size, "User expected TP world size = " \ - f"{args.tp_size} but expecting TP world size = {empirical_tp_world_size} from " \ - "model instead." + f"{args.tp_size} from model but tool is expecting TP world size = " \ + f"{empirical_tp_world_size} from model instead." 
for i in range(empirical_tp_world_size): assert i in rank_tensors_map, f"Expected TP world size = {empirical_tp_world_size} " \ "but did not find KV cache scaling factors " \ From 4cd76e9eb9def90c13be907fb6335aea2b64f268 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 19:03:49 +0000 Subject: [PATCH 047/159] Add output filename argument, rename output_path to output_dir, and clean up some logic --- 3rdparty/quantizer/extract_scales.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 84a28c33a7d24..ac859f79e88d0 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -110,9 +110,6 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, torch.cuda.empty_cache() -# Used by both main and if __name__ == "__main__" -_default_kvcache_scales_filename = "kv_cache_scales.json" - def main(args): rank_tensors_map = {} hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( @@ -157,7 +154,7 @@ def main(args): for name, param in _hf_tensorfile_iterator(tensor_file, args.load_format, use_safetensors): if "kv_cache_scaling_factor" in name: - nums = [int(s) for s in name.split(module_delimiter) if s.isdigit()] + nums = [int(s) for s in name.split(module_delimiter) if s.isdecimal()] assert len(nums) == 1, f"Could not determine layer idx for {name}" layer_idx = nums[0] assert layer_idx not in layer_scales_map, f"Duplicate scaling " \ @@ -171,14 +168,14 @@ def main(args): raise if args.output_path is None: - output_file = os.path.join(hf_folder, _default_kvcache_scales_filename) + output_file = os.path.join(hf_folder, args.output_name) else: - output_file = os.path.join(args.output_path, _default_kvcache_scales_filename) + output_file = os.path.join(args.output_path, args.output_name) if not os.path.isdir(args.output_path): os.makedirs(args.output_path, exist_ok=True) - if 
(len(rank_tensors_map) == 0 or - all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values())): + if all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values()): + # Note: this is true even if the rank_tensors_map is empty print("WARNING: No KV cache scale factors found. No output saved.") else: empirical_tp_world_size = max(rank_tensors_map.keys()) + 1 @@ -221,12 +218,14 @@ def main(args): parser.add_argument("--revision", help="Optionally specify the model's revision number.", default=None) - parser.add_argument("--output_path", + parser.add_argument("--output_dir", help="Optionally specify the output directory. By default the " - "KV cache scaling factors will be saved in the model directory " - f"with the filename {_default_kvcache_scales_filename}, however " - "you can override this behavior here.", + "KV cache scaling factors will be saved in the model directory, " + "however you can override this behavior here.", default=None) + parser.add_argument("--output_name", + help="Optionally specify the output filename.", + default="kv_cache_scales.json") parser.add_argument("--tp_size", help="Optionally specify the tensor-parallel (TP) size that the " "quantized model should correspond to. 
If specified, during KV " From ad8b841127a9db462fe5bde5c547e01cf8d4d66b Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 19:06:22 +0000 Subject: [PATCH 048/159] Fix up remaining 'output_path's from the rename --- 3rdparty/quantizer/extract_scales.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index ac859f79e88d0..357a971f96830 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -167,12 +167,12 @@ def main(args): "scale factor.") raise - if args.output_path is None: + if args.output_dir is None: output_file = os.path.join(hf_folder, args.output_name) else: - output_file = os.path.join(args.output_path, args.output_name) - if not os.path.isdir(args.output_path): - os.makedirs(args.output_path, exist_ok=True) + output_file = os.path.join(args.output_dir, args.output_name) + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) if all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values()): # Note: this is true even if the rank_tensors_map is empty From fec223242f57a84acb8434da5ba400f256da0bec Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 20:57:20 +0000 Subject: [PATCH 049/159] Add scaling factor correction for ROCm FP8 --- vllm/model_executor/layers/attention.py | 4 ++++ vllm/model_executor/models/llama.py | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 5fc69f3593114..c24ba13f12ee3 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -55,6 +55,10 @@ def __init__( # This will be set to a float by model initialization per attention, # if and only if we are using it. N.B. currently we only support per # tensor scalar scaling factors & only applicable to ROCm (AMD GPU). 
+ # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of setting + # scaling_factor = tensor_amax / FPtype_max self.kv_cache_scaling_factor = None assert self.num_heads % self.num_kv_heads == 0 diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index cd10653b8c1ff..4125afd2c9bd9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,6 +47,7 @@ kv_cache_scales_iterator) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig +from vllm.utils import is_hip KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -363,4 +364,10 @@ def load_weights(self, def load_kv_cache_scales(self, filename: str) -> None: for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + if is_hip(): + # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of setting + # scaling_factor = tensor_amax / FPtype_max + scaling_factor *= 2 layer_paged_attn.kv_cache_scaling_factor = scaling_factor From 7fdcf10591e5e6f65ec609f979684caee596ccd7 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 23 Feb 2024 21:18:46 +0000 Subject: [PATCH 050/159] Add example output for extract_scales --- .../llama2-7b-fp8-kv/kv_cache_scales.json | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json diff --git a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json new file mode 100644 index 0000000000000..3a29f7e321391 --- /dev/null +++ b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json @@ -0,0 +1,36 @@ +{ + "0": { + "0": 0.0152239128947258, + "1": 0.0188860222697258, + "2": 0.0354178324341774, + "3": 0.0376674123108387, + "4": 0.0418526791036129, + "5": 
0.0433175228536129, + "6": 0.0397600457072258, + "7": 0.0424455925822258, + "8": 0.0415387861430645, + "9": 0.0408412404358387, + "10": 0.0395856611430645, + "11": 0.0377371683716774, + "12": 0.0400739423930645, + "13": 0.040771484375, + "14": 0.0393415205180645, + "15": 0.0369001142680645, + "16": 0.03857421875, + "17": 0.0387486070394516, + "18": 0.0403180830180645, + "19": 0.0396205373108387, + "20": 0.0375627800822258, + "21": 0.0407366082072258, + "22": 0.0432477705180645, + "23": 0.0377022884786129, + "24": 0.0399693101644516, + "25": 0.0374581478536129, + "26": 0.0413295216858387, + "27": 0.0442243330180645, + "28": 0.0424804724752903, + "29": 0.0456891767680645, + "30": 0.0409109964966774, + "31": 0.0482352152466774 + } +} \ No newline at end of file From 6a6bbcd91da33eae7a2215fa0febef4c85b08c89 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 21:31:20 +0000 Subject: [PATCH 051/159] Strip out download functionality in scale extraction utility --- 3rdparty/quantizer/extract_scales.py | 62 +++++++--------------------- vllm/model_executor/weight_utils.py | 4 +- 2 files changed, 17 insertions(+), 49 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 357a971f96830..ee5e7d3068610 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -11,20 +11,17 @@ # Adapted from vllm/model_executor/weight_utils.py -# The main differences are that we add the NPZ format and that there's no -# need for a file lock when downloading model weights because this tool is -# not intended to be run on multiple processes simultaneously. -# Since our use case is sufficiently different, we define our own function -# here. +# The main differences are that we add the NPZ format and simplify +# its functionality drastically for our purposes (e.g. 
we assume that +# the quantized model exists locally and there is no need to download it) def _prepare_hf_weights( - model_name_or_path: str, - cache_dir: Optional[str] = None, + quantized_model_dir: str, load_format: str = "auto", fall_back_to_pt: bool = True, - revision: Optional[str] = None, ) -> Tuple[str, List[str], bool]: - # Download model weights from huggingface. - is_local = os.path.isdir(model_name_or_path) + if not os.path.isdir(quantized_model_dir): + raise FileNotFoundError(f"The quantized model directory `{quantized_model_dir}` " + "does not exist.") use_safetensors = False # Some quantized models use .pt files for storing the weights. if load_format == "auto": @@ -38,34 +35,17 @@ def _prepare_hf_weights( allow_patterns = ["*.npz"] else: raise ValueError(f"Unknown load_format: {load_format}") - if fall_back_to_pt: allow_patterns += ["*.pt"] - if not is_local: - # Before we download we look at that is available: - fs = HfFileSystem() - file_list = fs.ls(model_name_or_path, detail=False, revision=revision) - # depending on what is available we download different things - for pattern in allow_patterns: - matching = fnmatch.filter(file_list, pattern) - if len(matching) > 0: - allow_patterns = [pattern] - break - print(f"Downloading model... Using model weights format {allow_patterns}") - hf_folder = snapshot_download(model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - revision=revision) - else: - hf_folder = model_name_or_path hf_weights_files: List[str] = [] for pattern in allow_patterns: - hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) + hf_weights_files += glob.glob(os.path.join(quantized_model_dir, pattern)) if len(hf_weights_files) > 0: if pattern == "*.safetensors": use_safetensors = True break + if not use_safetensors: # Exclude files that are not needed for inference. 
# https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 @@ -83,9 +63,9 @@ def _prepare_hf_weights( if len(hf_weights_files) == 0: raise RuntimeError( - f"Cannot find any model weights with `{model_name_or_path}`") + f"Cannot find any model weights with `{quantized_model_dir}`") - return hf_folder, hf_weights_files, use_safetensors + return hf_weights_files, use_safetensors # Adapted from vllm/model_executor/weight_utils.py @@ -112,12 +92,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def main(args): rank_tensors_map = {} - hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( - args.quantized_model, - args.cache_dir, - args.load_format, - revision=args.revision, - fall_back_to_pt=True) + hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) # Matches the number immediately after this keyword in the tensor filename to # determine the TP rank corresponding to said tensor file rank_keyword = "rank" @@ -152,7 +127,7 @@ def main(args): module_delimiter = ":" if args.load_format == "npz" else "." for name, param in _hf_tensorfile_iterator(tensor_file, args.load_format, - use_safetensors): + use_safetensors): if "kv_cache_scaling_factor" in name: nums = [int(s) for s in name.split(module_delimiter) if s.isdecimal()] assert len(nums) == 1, f"Could not determine layer idx for {name}" @@ -168,7 +143,7 @@ def main(args): raise if args.output_dir is None: - output_file = os.path.join(hf_folder, args.output_name) + output_file = os.path.join(args.quantized_model, args.output_name) else: output_file = os.path.join(args.output_dir, args.output_name) if not os.path.isdir(args.output_dir): @@ -202,22 +177,15 @@ def main(args): "--kv_cache_scales_path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--quantized_model", - help="Specify either the local path to, or name of, a quantized HF model. 
" + help="Specify the directory containing a single quantized HF model. " "It is expected that the quantization format is FP8_E4M3, for use on ROCm " "(AMD GPU).", required=True) - parser.add_argument("--cache_dir", - help="Optionally specify a cache directory to use in the event of a HF " - "model download.", - default=None) parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " "containing the KV cache scaling factors.", choices=["auto", "safetensors", "npz", "pt"], default="auto") - parser.add_argument("--revision", - help="Optionally specify the model's revision number.", - default=None) parser.add_argument("--output_dir", help="Optionally specify the output directory. By default the " "KV cache scaling factors will be saved in the model directory, " diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 85441281eb4a9..44515916db685 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -309,11 +309,11 @@ def kv_cache_scales_iterator(filename: str, return layer_scales_map.items() except FileNotFoundError: - logger.error(f"File '{filename}' not found.") + logger.error(f"File or directory '{filename}' not found.") except json.JSONDecodeError: logger.error(f"Error decoding JSON in file '{filename}'.") except Exception as e: - logger.error(f"An error occurred while reading file '{filename}': {e}") + logger.error(f"An error occurred while reading '{filename}': {e}") # This section is only reached if any of the excepts are hit # Return an empty iterator (tuple) => no KV cache scales are loaded # which effectively defaults to 1.0 scales From 8e108d3810f8cd641fc2d22f1d8068937bcd6bbb Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 21:52:05 +0000 Subject: [PATCH 052/159] Correcting a stray type hint --- vllm/model_executor/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 5b41d94f178a7..2706f994476c8 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -262,7 +262,7 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor]]: +def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. Used by the model to populate the appropriate From 4dd7d1eab583a5e920a218afc1f94b45f7da5a1b Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 22:02:31 +0000 Subject: [PATCH 053/159] Correct a stray type hint --- vllm/model_executor/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 44515916db685..a8a59ca334607 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -265,7 +265,7 @@ def hf_model_weights_iterator( def kv_cache_scales_iterator(filename: str, tp_rank: int, tp_size: int, - num_hidden_layers: int) -> Iterator[Tuple[int, torch.Tensor]]: + num_hidden_layers: int) -> Iterator[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. 
Used by the model to populate the appropriate From 9a03b9606277cf120e2cc9aacc120d184028f66f Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Fri, 23 Feb 2024 16:45:00 -0600 Subject: [PATCH 054/159] Create README.md and add usage example --- tests/fp8_kv/README.md | 105 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 tests/fp8_kv/README.md diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md new file mode 100644 index 0000000000000..ccfb24686c341 --- /dev/null +++ b/tests/fp8_kv/README.md @@ -0,0 +1,105 @@ +# FP8 KV Cache Extraction Tool + +This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. + +## Prerequisites + +- Python 3.x +- PyTorch +- NumPy +- Hugging Face Transformers +- Hugging Face Hub +- AMMO + +Before using this tool, you'll need to follow these steps: + +1. Install all necessary prerequisites and dependencies. +2. Convert HF model into a quantized HF model. +3. Extract KV Cache Scaling Factors from quantized HF model. +4. Load KV Cache Scaling Factors into vLLM. + +### 2. Convert HF model into a quantized HF model. +Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). +## APIs + +[`quantize.py`](3rdparty/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). See this [`doc`](../../docs/source/new_workflow.md) for more details on the TensorRT-LLM checkpoint format. 
+ +The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found [here](https://github.com/ROCm/vllm-fp8/blob/fp8_kv/3rdparty/README.md). + +### 3. Extract KV Cache Scaling Factors from quantized HF model. +extract_scales.py can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports LLaMa models. It is also important to note the following: +1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. + +2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. + +3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks. + +```python +# prerequisites: +# - Quantized HF LLaMa model +python3 3rdparty/quantizer/extract_scales.py --help +Useage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--cache_dir CACHE_DIR] [--load_format {auto,safetensors,npz,pt}] [--revision REVISION] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] + +KV Scale Extraction Example + +optional arguments: +--quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). +Optional arguments: + +--cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None) +--load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto) +--revision: Specify the model's revision number. 
(Default: None) +--output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) +--output_name: Specify the output filename. (Default: kv_cache_scales.json) +--tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None) + +Example: +python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir + +### 4. Load KV Cache Scaling Factors into VLLM. +# prerequisites: +# - LLaMa kv_cache_scales.json file + +python3 benchmarks/benchmark_throughput.py --help +usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] + [--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] + [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code] + [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}] + [--kv-cache-scales-path KV_CACHE_SCALES_PATH] + +Benchmark Throughput Example +optional arguments: + -h, --help show this help message and exit + --backend {vllm,hf,mii} + --dataset DATASET Path to the dataset. + --input-len INPUT_LEN + Input prompt length for each request + --output-len OUTPUT_LEN + Output length for each request. Overrides the output length from the dataset. + --model MODEL + --tokenizer TOKENIZER + --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None} + --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE + --n N Number of generated sequences per prompt. + --use-beam-search + --num-prompts NUM_PROMPTS + Number of prompts to process. 
+ --seed SEED + --hf-max-batch-size HF_MAX_BATCH_SIZE + Maximum batch size for HF backend. + --trust-remote-code trust remote code from huggingface + --max-model-len MAX_MODEL_LEN + Maximum length of a sequence (including prompt and output). If None, will be derived from the model. + --dtype {auto,half,float16,bfloat16,float,float32} + data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 + models. + --enforce-eager enforce eager execution + --kv-cache-dtype {auto,fp8} + Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than + 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --kv-cache-scales-path KV_CACHE_SCALES_PATH + Path to the JSON files containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache + scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. + On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. +Example: +python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path From 4064973e057a67465e9c1a48d98e1920f85c8475 Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Fri, 23 Feb 2024 16:52:31 -0600 Subject: [PATCH 055/159] Added benchmark description --- tests/fp8_kv/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index ccfb24686c341..da2d5c5c7acf0 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -55,8 +55,10 @@ Optional arguments: Example: python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir - +``` ### 4. Load KV Cache Scaling Factors into VLLM. 
+The script evaluates the inference throughput of language models using various backends such as vLLM, HF (Hugging Face). It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. +```python # prerequisites: # - LLaMa kv_cache_scales.json file @@ -103,3 +105,4 @@ optional arguments: On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. Example: python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path +```python From 988ffc3dfceb238392d262389d0697bfbfd8f71c Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Fri, 23 Feb 2024 20:00:46 -0600 Subject: [PATCH 056/159] Clean up readme --- tests/fp8_kv/README.md | 62 +++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index da2d5c5c7acf0..e71a8b58dce69 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -1,4 +1,4 @@ -# FP8 KV Cache Extraction Tool +# FP8 KV Cache This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. @@ -11,8 +11,7 @@ This utility extracts the KV cache scaling factors from a quantized HF (Hugging - Hugging Face Hub - AMMO -Before using this tool, you'll need to follow these steps: - +Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps: 1. Install all necessary prerequisites and dependencies. 2. Convert HF model into a quantized HF model. 3. 
Extract KV Cache Scaling Factors from quantized HF model. @@ -20,14 +19,13 @@ Before using this tool, you'll need to follow these steps: ### 2. Convert HF model into a quantized HF model. Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). -## APIs -[`quantize.py`](3rdparty/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). See this [`doc`](../../docs/source/new_workflow.md) for more details on the TensorRT-LLM checkpoint format. +[`quantize.py`](https://github.com/ROCm/vllm-fp8/blob/fp8_doc/3rdparty/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found [here](https://github.com/ROCm/vllm-fp8/blob/fp8_kv/3rdparty/README.md). ### 3. Extract KV Cache Scaling Factors from quantized HF model. -extract_scales.py can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports LLaMa models. It is also important to note the following: +[`extract_scales.py`](https://github.com/ROCm/vllm-fp8/blob/fp8_doc/3rdparty/quantizer/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: 1. 
**File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. 2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. @@ -36,7 +34,7 @@ extract_scales.py can be utilized to extract the KV cache scaling factors from y ```python # prerequisites: -# - Quantized HF LLaMa model +# - Quantized HF LLaMa 2 model python3 3rdparty/quantizer/extract_scales.py --help Useage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--cache_dir CACHE_DIR] [--load_format {auto,safetensors,npz,pt}] [--revision REVISION] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] @@ -45,22 +43,22 @@ KV Scale Extraction Example optional arguments: --quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). Optional arguments: - --cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None) --load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto) --revision: Specify the model's revision number. (Default: None) --output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) --output_name: Specify the output filename. (Default: kv_cache_scales.json) --tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. 
If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None) - +``` +```python Example: -python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir +python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. -The script evaluates the inference throughput of language models using various backends such as vLLM, HF (Hugging Face). It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. +This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. ```python # prerequisites: -# - LLaMa kv_cache_scales.json file +# - LLaMa 2 kv_cache_scales.json file python3 benchmarks/benchmark_throughput.py --help usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] @@ -71,38 +69,28 @@ usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET Benchmark Throughput Example optional arguments: - -h, --help show this help message and exit + -h, --help show this help message and exit --backend {vllm,hf,mii} - --dataset DATASET Path to the dataset. - --input-len INPUT_LEN - Input prompt length for each request - --output-len OUTPUT_LEN - Output length for each request. Overrides the output length from the dataset. + --dataset DATASET Path to the dataset. 
+ --input-len INPUT_LEN Input prompt length for each request + --output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset. --model MODEL --tokenizer TOKENIZER --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None} --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE - --n N Number of generated sequences per prompt. + --n N Number of generated sequences per prompt. --use-beam-search - --num-prompts NUM_PROMPTS - Number of prompts to process. + --num-prompts NUM_PROMPTS Number of prompts to process. --seed SEED - --hf-max-batch-size HF_MAX_BATCH_SIZE - Maximum batch size for HF backend. - --trust-remote-code trust remote code from huggingface - --max-model-len MAX_MODEL_LEN - Maximum length of a sequence (including prompt and output). If None, will be derived from the model. - --dtype {auto,half,float16,bfloat16,float,float32} - data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 - models. - --enforce-eager enforce eager execution - --kv-cache-dtype {auto,fp8} - Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than - 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. - --kv-cache-scales-path KV_CACHE_SCALES_PATH - Path to the JSON files containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache - scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. - On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --hf-max-batch-size HF_MAX_BATCH_SIZE Maximum batch size for HF backend. 
+ --trust-remote-code trust remote code from huggingface + --max-model-len MAX_MODEL_LEN Maximum length of a sequence (including prompt and output). If None, will be derived from the model. + --dtype {auto,half,float16,bfloat16,float,float32} data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. + --enforce-eager enforce eager execution + --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --kv-cache-scales-path KV_CACHE_SCALES_PATH Path to the JSON files containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.
+``` +``` Example: python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path ```python From 534dcff5ab7bbb1f7e9971438bbd4e9cd88a04fc Mon Sep 17 00:00:00 2001 From: Doug Lehr Date: Mon, 26 Feb 2024 18:07:20 +0000 Subject: [PATCH 057/159] Don't broadcast when using torchrun --- vllm/model_executor/layers/tuned_gemm.py | 6 +++++- vllm/worker/model_runner.py | 3 ++- vllm/worker/worker.py | 15 ++++++++++----- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py index 1dc0dcc9a4670..bebab27ebfd86 100644 --- a/vllm/model_executor/layers/tuned_gemm.py +++ b/vllm/model_executor/layers/tuned_gemm.py @@ -97,7 +97,11 @@ def mm(self,inp,weights): #print(">>> found rocblas") out = rocb_mm(inp_view,weights.t(),solidx) else: - #print('>>>Tgemm Default',inp.shape,weights.shape,soltype,solidx) + + if (self.save_gemm == 1): + print('>>>Tgemm Default',inp_view.shape, inp.shape,weights.shape,soltype,solidx) + self.tuned_df = pd.concat([self.tuned_df, pd.DataFrame({'M':[weights.shape[0]], 'N':[inp.shape[0]*inp.shape[1]], 'K':[weights.shape[1]]})]).drop_duplicates() + self.tuned_df.to_csv(self.untune_path, index=False) out = F.linear(inp,weights) if batched: return out.view(inp.shape[0], inp.shape[1], weights.shape[0]) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 985115613e044..f7e40bae30990 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -25,6 +25,7 @@ # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. # NOTE: _get_graph_batch_size needs to be updated if this list is changed. 
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] +#_BATCH_SIZES_TO_CAPTURE = [1] class ModelRunner: @@ -477,7 +478,7 @@ def prepare_input_tensors( "lora_requests": lora_requests, "lora_mapping": lora_mapping, } - broadcast_tensor_dict(metadata_dict, src=0) + #broadcast_tensor_dict(metadata_dict, src=0) else: metadata_dict = broadcast_tensor_dict(src=0) input_tokens = metadata_dict["input_tokens"] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index aafd7306acf5d..0121486dce246 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -46,6 +46,9 @@ def __init__( self.distributed_init_method = distributed_init_method self.lora_config = lora_config self.is_driver_worker = is_driver_worker + local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.local_rank = local_rank + if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -53,7 +56,7 @@ def __init__( parallel_config, scheduler_config, lora_config=self.lora_config, - is_driver_worker=is_driver_worker) + is_driver_worker=self.is_driver_worker) # Uninitialized cache engine. Will be initialized by # self.init_cache_engine(). 
self.cache_config = None @@ -74,8 +77,10 @@ def init_model(self) -> None: os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) self.rank = self.rank if self.rank is not None else int( os.getenv("RANK", "-1")) - local_rank = int(os.getenv("LOCAL_RANK", "0")) - self.device = torch.device(f"cuda:{local_rank}") + + self.device = torch.device(f"cuda:{self.local_rank}") + + torch.cuda.set_device(self.device) _check_if_gpu_supports_dtype(self.model_config.dtype) @@ -182,7 +187,7 @@ def execute_model( blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, ) -> Optional[SamplerOutput]: - if self.is_driver_worker: + if self.is_driver_worker and self.rank == 0: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) assert blocks_to_swap_in is not None @@ -194,7 +199,7 @@ def execute_model( "blocks_to_swap_out": blocks_to_swap_out, "blocks_to_copy": blocks_to_copy, } - broadcast_tensor_dict(data, src=0) + #broadcast_tensor_dict(data, src=0) else: data = broadcast_tensor_dict(src=0) num_seq_groups = data["num_seq_groups"] From ee6ba29d5513a35d2d240819182c46e420cc2d85 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Mon, 26 Feb 2024 20:28:33 +0000 Subject: [PATCH 058/159] Change convention: Initialize scaling factors always if KV cache is FP8 and exceptions are handled, else terminate program execution --- vllm/model_executor/models/llama.py | 13 +++++++++++++ vllm/worker/model_runner.py | 26 +++++++++++++++----------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index eafb6d247342d..12ef27bd7b45c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -361,7 +361,14 @@ def load_weights(self, weight_loader(param, loaded_weight) # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) + # If this function is called, it should always initialize KV 
cache scale + # factors (or else raise an exception). Thus, handled exceptions should + # make sure to leave KV cache scale factors in a known good (dummy) state def load_kv_cache_scales(self, filename: str) -> None: + # Initialize KV cache scales to dummy values first. These will be + # overwritten by the actual values if and only if the later loading + # process completes without error + self.load_dummy_kv_cache_scales() tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_iterator( @@ -369,3 +376,9 @@ def load_kv_cache_scales(self, filename: str) -> None: self.model.config.num_hidden_layers): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn layer_paged_attn.kv_cache_scaling_factor = scaling_factor + + # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) + def load_dummy_kv_cache_scales(self) -> None: + for layer_idx in range(self.model.config.num_hidden_layers): + layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + setattr(layer_paged_attn, "kv_cache_scaling_factor", 1.0) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c4a48183ccc55..361ed163dba3c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -84,21 +84,25 @@ def load_model(self) -> None: self.lora_config, self.device) self.model = self.lora_manager.create_lora_manager(self.model) - if self.model_config.kv_cache_scales_path is not None: - if self.kv_cache_dtype == "fp8": + if self.kv_cache_dtype == "fp8": + if self.model_config.kv_cache_scales_path is not None: if callable(getattr(self.model, "load_kv_cache_scales", None)): self.model.load_kv_cache_scales(self.model_config.kv_cache_scales_path) else: - logger.warn("Using FP8 KV cache and scaling factors provided but " - f"model {self.model.__class__} does not support loading " - "scaling factors. 
Defaulting to scaling factors of 1.0, " - "This may lead to less accurate results!") + raise RuntimeError("Using FP8 KV cache and scaling factors provided but " + f"model {self.model.__class__} does not support loading " + "scaling factors.") + elif callable(getattr(self.model, "load_dummy_kv_cache_scales", None)): + logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " + "scaling factors of 1.0, This may lead to less accurate results!") + self.model.load_dummy_kv_cache_scales() else: - logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. " - "KV cache scaling factors will not be used.") - elif self.kv_cache_dtype == "fp8": - logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " - "scaling factors of 1.0, This may lead to less accurate results!") + raise RuntimeError(f"Using FP8 KV cache but no scaling factors provided and model " + "does not support loading dummy KV cache scaling factors.") + elif self.model_config.kv_cache_scales_path is not None: + logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. " + "KV cache scaling factors will not be used.") + def set_block_size(self, block_size: int) -> None: self.block_size = block_size From 400765687efd5974a2e4702b4493a422615a2050 Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Mon, 26 Feb 2024 15:47:40 -0600 Subject: [PATCH 059/159] Updated example descriptions --- tests/fp8_kv/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index e71a8b58dce69..fde6e9d6187c3 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -52,7 +52,7 @@ Optional arguments: ``` ```python Example: -python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir +python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. 
This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. @@ -92,5 +92,5 @@ optional arguments: ``` ``` Example: -python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path +python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path ```python From c8059c2260468df4f997a5ab76359a096d3a1576 Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Mon, 26 Feb 2024 23:40:31 -0500 Subject: [PATCH 060/159] Kernel and Device functions to enable FP8 KV cache scaling factors --- csrc/attention/attention_kernels.cu | 75 +++--- csrc/cache.h | 3 +- csrc/cache_kernels.cu | 17 +- csrc/ops.h | 6 +- .../fp8/amd_detail/quant_utils.cuh | 227 ++++++++++++++++++ vllm/model_executor/layers/attention.py | 6 + 6 files changed, 292 insertions(+), 42 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 602e2e496af91..aa176d4c4d74a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -111,7 +111,8 @@ __device__ void paged_attention_kernel( const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, - const int kv_head_stride) { + const int kv_head_stride, + const float kv_scale) { const int seq_idx = blockIdx.y; const int partition_idx = blockIdx.z; const int max_num_partitions = gridDim.z; @@ -231,8 +232,8 @@ __device__ void paged_attention_kernel( k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); #elif defined(ENABLE_FP8_E4M3) Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); - // Vector conversion from 
Quant_vec to K_vec. - k_vecs[j] = fp8_e4m3::vec_conversion(k_vec_quant); + // Vector conversion from Quant_vec to K_vec. Scaled conversion: FP8 => higher precision + k_vecs[j] = fp8_e4m3::scaled_vec_conversion(k_vec_quant, kv_scale); #else assert(false); #endif @@ -355,8 +356,8 @@ __device__ void paged_attention_kernel( v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); #elif defined(ENABLE_FP8_E4M3) V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); - // Vector conversion from V_quant_vec to V_vec. - v_vec = fp8_e4m3::vec_conversion(v_quant_vec); + // Vector conversion from V_quant_vec to V_vec. Scaled conversion: FP8 => higher precision + v_vec = fp8_e4m3::scaled_vec_conversion(v_quant_vec, kv_scale); #else assert(false); #endif @@ -461,11 +462,12 @@ __global__ void paged_attention_v1_kernel( const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, - const int kv_head_stride) { + const int kv_head_stride, + const float kv_scale) { paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, - max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); + max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale); } // Grid: (num_heads, num_seqs, max_num_partitions). @@ -492,11 +494,12 @@ __global__ void paged_attention_v2_kernel( const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, - const int kv_head_stride) { + const int kv_head_stride, + const float kv_scale) { paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, - q_stride, kv_block_stride, kv_head_stride); + q_stride, kv_block_stride, kv_head_stride, kv_scale); } // Grid: (num_heads, num_seqs). 
@@ -603,9 +606,9 @@ __global__ void paged_attention_v2_reduce_kernel( #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \ + IS_FP8_KV_CACHE>), shared_mem_size); \ vllm::paged_attention_v1_kernel<<>>( \ + IS_FP8_KV_CACHE><<>>( \ out_ptr, \ query_ptr, \ key_cache_ptr, \ @@ -618,7 +621,8 @@ __global__ void paged_attention_v2_reduce_kernel( alibi_slopes_ptr, \ q_stride, \ kv_block_stride, \ - kv_head_stride); + kv_head_stride, \ + kv_scale); // TODO(woosuk): Tune NUM_THREADS. template< @@ -637,7 +641,8 @@ void paged_attention_v1_launcher( torch::Tensor& block_tables, torch::Tensor& context_lens, int max_context_len, - const c10::optional& alibi_slopes) { + const c10::optional& alibi_slopes, + float kv_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -701,8 +706,8 @@ void paged_attention_v1_launcher( } } -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - paged_attention_v1_launcher( \ +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v1_launcher( \ out, \ query, \ key_cache, \ @@ -712,20 +717,21 @@ void paged_attention_v1_launcher( block_tables, \ context_lens, \ max_context_len, \ - alibi_slopes); + alibi_slopes, \ + kv_scale); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. 
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -744,7 +750,8 @@ void paged_attention_v1( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype) { + const std::string& kv_cache_dtype, + float kv_scale) { if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Float) { CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false); @@ -772,7 +779,7 @@ void paged_attention_v1( #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ vllm::paged_attention_v2_kernel \ + IS_FP8_KV_CACHE, PARTITION_SIZE> \ <<>>( \ exp_sums_ptr, \ max_logits_ptr, \ @@ -788,7 +795,8 @@ void paged_attention_v1( alibi_slopes_ptr, \ q_stride, \ kv_block_stride, \ - kv_head_stride); \ + kv_head_stride, \ + kv_scale); \ vllm::paged_attention_v2_reduce_kernel \ <<>>( \ out_ptr, \ @@ -818,7 +826,8 @@ void paged_attention_v2_launcher( torch::Tensor& block_tables, torch::Tensor& context_lens, int max_context_len, - const c10::optional& alibi_slopes) { + const c10::optional& alibi_slopes, + float kv_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -888,8 +897,8 @@ void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - paged_attention_v2_launcher( \ +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v2_launcher( \ out, \ exp_sums, \ max_logits, \ @@ 
-902,20 +911,21 @@ void paged_attention_v2_launcher( block_tables, \ context_lens, \ max_context_len, \ - alibi_slopes); + alibi_slopes, \ + kv_scale); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. -#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -937,7 +947,8 @@ void paged_attention_v2( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype) { + const std::string& kv_cache_dtype, + float kv_scale) { if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Float) { CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false); diff --git a/csrc/cache.h b/csrc/cache.h index aafee5524fe2c..82b90eb4ab631 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -21,7 +21,8 @@ void reshape_and_cache( torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + const float kv_scale); void gather_cached_kv( torch::Tensor& key, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index eef1501057d9a..73f61e92b1a51 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -165,7 +165,8 @@ __global__ void reshape_and_cache_kernel( const int num_heads, const int head_size, const int block_size, - const int x) { + const int x, + const float kv_scale) { const int64_t token_idx = blockIdx.x; const int64_t 
slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { @@ -202,8 +203,8 @@ __global__ void reshape_and_cache_kernel( key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); #elif defined(ENABLE_FP8_E4M3) - key_cache[tgt_key_idx] = fp8_e4m3::vec_conversion(tgt_key); - value_cache[tgt_value_idx] = fp8_e4m3::vec_conversion(tgt_value); + key_cache[tgt_key_idx] = fp8_e4m3::scaled_vec_conversion(tgt_key, kv_scale); + value_cache[tgt_value_idx] = fp8_e4m3::scaled_vec_conversion(tgt_value, kv_scale); #else assert(false); #endif @@ -216,8 +217,8 @@ __global__ void reshape_and_cache_kernel( } // namespace vllm -#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ - vllm::reshape_and_cache_kernel<<>>( \ +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ + vllm::reshape_and_cache_kernel<<>>( \ reinterpret_cast(key.data_ptr()), \ reinterpret_cast(value.data_ptr()), \ reinterpret_cast(key_cache.data_ptr()), \ @@ -228,7 +229,8 @@ __global__ void reshape_and_cache_kernel( num_heads, \ head_size, \ block_size, \ - x); + x, \ + kv_scale); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -236,7 +238,8 @@ void reshape_and_cache( torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype) + const std::string& kv_cache_dtype, + const float kv_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); diff --git a/csrc/ops.h b/csrc/ops.h index 2bcd0c2efc5c6..0f6ea5838e8fe 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -14,7 +14,8 @@ void paged_attention_v1( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + float kv_scale); void paged_attention_v2( 
torch::Tensor& out, @@ -31,7 +32,8 @@ void paged_attention_v2( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + float kv_scale); void rms_norm( torch::Tensor& out, diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh index 7bc70d9264ab8..d49b67425c95a 100644 --- a/csrc/quantization/fp8/amd_detail/quant_utils.cuh +++ b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -17,6 +17,12 @@ __inline__ __device__ Tout vec_conversion(const Tin& x) return x; } +template +__inline__ __device__ Tout scaled_vec_conversion(const Tin& x, const float scale) +{ + return x; +} + // fp8 -> half template <> __inline__ __device__ uint16_t vec_conversion(const uint8_t& a) @@ -215,6 +221,7 @@ __inline__ __device__ float4 vec_conversion(const uint32_t& a) return res; } +// float2 -> half2 template <> __inline__ __device__ uint32_t vec_conversion(const float2& a) { @@ -227,6 +234,7 @@ __inline__ __device__ uint32_t vec_conversion(const float2& a) return uint32; } +// Float4 -> half2x2 template <> __inline__ __device__ uint2 vec_conversion(const Float4_& a) { @@ -242,6 +250,7 @@ __inline__ __device__ uint2 vec_conversion(const Float4_& a) return b; } +// Float4 -> float4 template <> __inline__ __device__ float4 vec_conversion(const Float4_& a) { @@ -253,6 +262,7 @@ __inline__ __device__ float4 vec_conversion(const Float4_& a) return b; } +// Float8 -> half2x4 template <> __inline__ __device__ uint4 vec_conversion(const Float8_& a) { @@ -264,6 +274,7 @@ __inline__ __device__ uint4 vec_conversion(const Float8_& a) return b; } +// float2 -> bfloat162 template <> __inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a) { @@ -271,6 +282,7 @@ __inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(cons return b; } +// Float4 -> bfloat162x2 template <> __inline__ __device__ bf16_4_t 
vec_conversion(const Float4_& a) { @@ -280,6 +292,7 @@ __inline__ __device__ bf16_4_t vec_conversion(const Float4_& return b; } +// Float8 -> bfloat162x4 template <> __inline__ __device__ bf16_8_t vec_conversion(const Float8_& a) { @@ -290,5 +303,219 @@ __inline__ __device__ bf16_8_t vec_conversion(const Float8_& b.w = __float22bfloat162_rn(a.w); return b; } + + +/* Scaled and vectorized conversions, for data exchange between high and low precision domains + + Convention of the scale in API, e.g: FP8_data = Quantization( High_Precision_data / scale ) + s.t. + Quantize(HP / scale) => FP8 + Dequant(FP8) * scale => HP + + */ + +// fp8 -> half +template <> +__inline__ __device__ uint16_t scaled_vec_conversion(const uint8_t& a, const float scale) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + __half_raw res; + res.data = static_cast(f8) * scale; + return res.x; +} + +// fp8x2 -> half2 +template <> +__inline__ __device__ uint32_t scaled_vec_conversion(const uint16_t& a, const float scale) +{ +#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + union { + __half2_raw h2r; + uint32_t ui32; + } tmp; + tmp.h2r.x.data = f2[0] * scale; + tmp.h2r.y.data = f2[1] * scale; + return tmp.ui32; +#else + union { + uint16_t u16[2]; + uint32_t u32; + } tmp; + + tmp.u16[0] = scaled_vec_conversion(static_cast(a), scale); + tmp.u16[1] = scaled_vec_conversion(static_cast(a >> 8U), scale); + return tmp.u32; +#endif +} + +// fp8x4 -> half2x2 +template <> +__inline__ __device__ uint2 scaled_vec_conversion(const uint32_t& a, const float scale) +{ + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = scaled_vec_conversion((uint16_t)a, scale); + tmp.u32[1] = scaled_vec_conversion((uint16_t)(a >> 16U), scale); + return tmp.u32x2; +} + +// fp8x8 -> half2x4 +template <> +__inline__ __device__ uint4 scaled_vec_conversion(const uint2& a, const float scale) +{ + union { + uint4 u64x2; + uint2 u64[2]; + } 
tmp; + tmp.u64[0] = scaled_vec_conversion(a.x, scale); + tmp.u64[1] = scaled_vec_conversion(a.y, scale); + return tmp.u64x2; +} + +using __nv_bfloat16 = __hip_bfloat16; + +// fp8 -> __nv_bfloat16 +template <> +__inline__ __device__ __nv_bfloat16 scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, const float scale) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + float f{f8}; + return __float2bfloat16(f * scale); +} + +using __nv_bfloat162 = __hip_bfloat162; + +// fp8x2 -> __nv_bfloat162 +template <> +__inline__ __device__ __nv_bfloat162 scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a, const float scale) +{ + __nv_bfloat162 res; + res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale); + res.y = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U), scale); + return res; +} + +// fp8x4 -> bf16_4_t +template <> +__inline__ __device__ bf16_4_t scaled_vec_conversion(const uint32_t& a, const float scale) +{ + bf16_4_t res; + res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale); + res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U), scale); + return res; +} + +// fp8x8 -> bf16_8_t +template <> +__inline__ __device__ bf16_8_t scaled_vec_conversion(const uint2& a, const float scale) +{ + bf16_4_t tmp1, tmp2; + tmp1 = scaled_vec_conversion(a.x, scale); + tmp2 = scaled_vec_conversion(a.y, scale); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// fp8 -> float +template <> +__inline__ __device__ float scaled_vec_conversion(const uint8_t& a, const float scale) +{ + hip_fp8 fp8{a, hip_fp8::from_bits()}; + return static_cast(fp8) * scale; +} + +// fp8x2 -> float2 +template <> +__inline__ __device__ float2 scaled_vec_conversion(const uint16_t& a, const float scale) +{ +#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + float2 res; + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + 
//res.x = vec_conversion(static_cast(a)); + //res.y = vec_conversion(static_cast(a >> 8U)); + res.x = f2[0] * scale; + res.y = f2[1] * scale; + return res; +#else + float2 res; + res.x = scaled_vec_conversion(static_cast(a), scale); + res.y = scaled_vec_conversion(static_cast(a >> 8U), scale); + return res; +#endif +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ Float4_ scaled_vec_conversion(const uint32_t& a, const float scale) +{ + Float4_ res; + res.x = scaled_vec_conversion((uint16_t)a, scale); + res.y = scaled_vec_conversion((uint16_t)(a >> 16U), scale); + return res; +} + +// fp8x8 -> float8 +template <> +__inline__ __device__ Float8_ scaled_vec_conversion(const uint2& a, const float scale) +{ + Float4_ tmp1, tmp2; + tmp1 = scaled_vec_conversion(a.x, scale); + tmp2 = scaled_vec_conversion(a.y, scale); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + + +/* Quantize(HP / scale) => FP8 */ + +// TODO(Hai): vectorized to add + +// half -> fp8 +template <> +__inline__ __device__ uint8_t scaled_vec_conversion(const uint16_t& a, const float scale) +{ + __half_raw tmp; + tmp.x = a; + + hip_fp8 f8{static_cast(tmp.data)/scale}; + return f8.data; +} + +// bf16 -> fp8 +template <> +__inline__ __device__ uint8_t scaled_vec_conversion(const __nv_bfloat16& a, const float scale) +{ + hip_fp8 res{__bfloat162float(a)/scale}; + return res.data; +} + +// float -> fp8 +template <> +__inline__ __device__ uint8_t scaled_vec_conversion(const float& a, const float scale) +{ + hip_fp8 f8(a/scale); + return f8.data; +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ float4 scaled_vec_conversion(const uint32_t& a, const float scale) +{ + Float4_ tmp = scaled_vec_conversion(a, scale); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; +} + } } // namespace vllm diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 
c24ba13f12ee3..f7baeadbb695a 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -109,6 +109,7 @@ def forward( value_cache, input_metadata.slot_mapping.flatten(), input_metadata.kv_cache_dtype, + self.kv_cache_scaling_factor, ) if input_metadata.is_prompt: @@ -171,6 +172,7 @@ def forward( output = out.view_as(query) else: # prefix-enabled attention + # TODO(Hai) this triton kernel has regression issue with FP8 KVCache to handle mixed types output = torch.empty_like(query) context_attention_fwd( query, @@ -197,6 +199,7 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, + self.kv_cache_scaling_factor, ) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) @@ -244,6 +247,7 @@ def _paged_attention( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], + kv_scale: float, ) -> torch.Tensor: output = torch.empty_like(query) @@ -276,6 +280,7 @@ def _paged_attention( input_metadata.max_context_len, alibi_slopes, input_metadata.kv_cache_dtype, + kv_scale, ) else: # Run PagedAttention V2. 
@@ -307,5 +312,6 @@ def _paged_attention( input_metadata.max_context_len, alibi_slopes, input_metadata.kv_cache_dtype, + kv_scale, ) return output From fc2cdaf455d10164128126532b911e396685c005 Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Tue, 27 Feb 2024 13:25:24 -0500 Subject: [PATCH 061/159] Make KV cache scaling factors default to 1.0 instead of None --- vllm/model_executor/layers/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index f7baeadbb695a..2a67f4216ef1f 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -59,7 +59,7 @@ def __init__( # quantized_value * scaling_factor ~= true_value # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max - self.kv_cache_scaling_factor = None + self.kv_cache_scaling_factor = 1.0 assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads From 76c6058d14f6549032b298b8131dac8b7f3775e2 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 27 Feb 2024 20:20:45 +0000 Subject: [PATCH 062/159] Update KV cache scales loader name to clarify that we are not using an iterator at present --- vllm/model_executor/models/llama.py | 4 ++-- vllm/model_executor/weight_utils.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 06733e4542818..3c10f3600dbca 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -45,7 +45,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator, - kv_cache_scales_iterator) + kv_cache_scales_loader) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig from vllm.utils import is_hip @@ 
-372,7 +372,7 @@ def load_kv_cache_scales(self, filename: str) -> None: self.load_dummy_kv_cache_scales() tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_iterator( + for layer_idx, scaling_factor in kv_cache_scales_loader( filename, tp_rank, tp_size, self.model.config.num_hidden_layers): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index a8a59ca334607..af8706444cff7 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -262,7 +262,7 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_iterator(filename: str, +def kv_cache_scales_loader(filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int) -> Iterator[Tuple[int, float]]: @@ -277,12 +277,12 @@ def kv_cache_scales_iterator(filename: str, """ try: with open(filename) as f: - # For now we do not obtain any of the benefits of iterators - # but since the number of layers = number of scales is typically - # small, this is not a concern. Loading and processing the entire - # dictionary at once allows us to do sanity checks all at once and - # avoid a situation where we have to abort after having partially - # loaded scaling factors + # Loading and processing the entire dictionary at once allows us + # to do sanity checks all at once and avoid a situation where we + # have to abort after having partially loaded scaling factors + # Since the number of layers is small and (for now) we use scalar + # scaling factors (so the size they use is also small), this is + # not a concern at present. 
raw_rank_map = json.load(f, parse_int=int, parse_constant=float) # If any of the inputs are malformed, it raises an error somewhere From c825bb337c63d83d949542998bd4acae56e5fe7f Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Tue, 27 Feb 2024 20:59:37 -0500 Subject: [PATCH 063/159] Fix test cases from the introduction of KV cache scaling factors, using default kv_scale=1.0 --- benchmarks/kernels/benchmark_paged_attention.py | 5 +++++ tests/kernels/test_attention.py | 5 +++++ tests/kernels/test_cache.py | 5 ++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index ce103af3240a8..8368dc049e6a4 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -90,6 +90,9 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() + # Using default kv_scale + kv_scale = 1.0 + for _ in range(num_iters): if version == "v1": ops.paged_attention_v1( @@ -105,6 +108,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) elif version == "v2": ops.paged_attention_v2( @@ -123,6 +127,7 @@ def run_benchmark(num_iters: int, profile: bool = False) -> float: max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) else: raise ValueError(f"Invalid version: {version}") diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index fe50a60f71adc..ef6b3bdea625b 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -168,6 +168,9 @@ def test_paged_attention( gpu_id) key_cache, value_cache = key_caches[0], value_caches[0] + # Using default kv_scale + kv_scale = 1.0 + # Call the paged attention kernel. 
output = torch.empty_like(query) if version == "v1": @@ -184,6 +187,7 @@ def test_paged_attention( max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) elif version == "v2": num_partitions = ((max_context_len + PARTITION_SIZE - 1) // @@ -217,6 +221,7 @@ def test_paged_attention( max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) else: raise AssertionError(f"Unknown version: {version}") diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 6db2c81f7aeaa..9389ae1eb5f75 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -146,9 +146,12 @@ def test_reshape_and_cache( cloned_key_cache = key_cache.clone() cloned_value_cache = value_cache.clone() + # Using default kv_scale + kv_scale = 1.0 + # Call the reshape_and_cache kernel. cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype) + slot_mapping, kv_cache_dtype, kv_scale) if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) From 4f574cd2589ad8ea1e02cb9eb3a62d091189c82e Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Thu, 29 Feb 2024 14:22:57 -0500 Subject: [PATCH 064/159] Cleanup comments according to reviews --- csrc/quantization/fp8/amd_detail/quant_utils.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh index d49b67425c95a..5f597407e3b02 100644 --- a/csrc/quantization/fp8/amd_detail/quant_utils.cuh +++ b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -438,8 +438,6 @@ __inline__ __device__ float2 scaled_vec_conversion(const uint1 #if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) float2 res; const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); - //res.x = vec_conversion(static_cast(a)); - //res.y = vec_conversion(static_cast(a >> 8U)); res.x = f2[0] * scale; res.y = f2[1] * scale; return res; From 
f325cb033842a1d168df9cc2797b870fac454d3b Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Mon, 5 Feb 2024 15:47:56 +0000 Subject: [PATCH 065/159] Add hip_fp8 datatype and conversions Add non-MI300 compatible alternative for bulk conversions Removed bf8 (e5m2) and renamed f8 to fp8 to explicitly specify that it is e4m3 Removed stochastic rounding for simplicity Put bulk fp8 conversion hip intrinsics behind a define. Disabled by default Using types from the proper vllm headers. Added namespace Move amd specific headers under amd_detail --- csrc/quantization/fp8/amd_detail/hip_float8.h | 167 +++++++++ .../fp8/amd_detail/hip_float8_impl.h | 316 ++++++++++++++++++ .../fp8/amd_detail/quant_utils.cuh | 293 ++++++++++++++++ 3 files changed, 776 insertions(+) create mode 100644 csrc/quantization/fp8/amd_detail/hip_float8.h create mode 100644 csrc/quantization/fp8/amd_detail/hip_float8_impl.h create mode 100644 csrc/quantization/fp8/amd_detail/quant_utils.cuh diff --git a/csrc/quantization/fp8/amd_detail/hip_float8.h b/csrc/quantization/fp8/amd_detail/hip_float8.h new file mode 100644 index 0000000000000..87c7c9ce66100 --- /dev/null +++ b/csrc/quantization/fp8/amd_detail/hip_float8.h @@ -0,0 +1,167 @@ +#pragma once + +#ifdef __HIPCC__ +#include +#else +#include +#include +#include +#include +#endif + +#include "hip_float8_impl.h" + +struct alignas(1) hip_fp8 +{ + struct from_bits_t + { + }; + HIP_FP8_HOST_DEVICE static constexpr from_bits_t from_bits() { return from_bits_t(); } + uint8_t data; + + hip_fp8() = default; + HIP_FP8_HOST_DEVICE constexpr hip_fp8(const hip_fp8&) = default; + HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v) = delete; + explicit HIP_FP8_HOST_DEVICE constexpr hip_fp8(uint8_t v, from_bits_t) + : data(v) + { + } + +#ifdef __HIP__MI300__ + // NOTE: ON-DEVICE... 
always optimal bias + explicit HIP_FP8_DEVICE hip_fp8(float v) + : data(hip_fp8_impl::to_fp8_from_fp32(v)) + { + } + + explicit HIP_FP8_DEVICE hip_fp8(_Float16 v) + : hip_fp8(static_cast(v)) + { + } + + // Host only implementation using s/w simulation + explicit HIP_FP8_HOST +#else // __HIP__MI300__ + // both Host and DEVICE for non-MI300 using s/w simulation + explicit HIP_FP8_HOST_DEVICE +#endif // __HIP__MI300__ + hip_fp8(float v) + { + data = hip_fp8_impl::to_float8<4, 3, float, true /*negative_zero_nan*/, true /*clip*/>(v); + } + + explicit HIP_FP8_HOST_DEVICE hip_fp8(double v) + : hip_fp8(static_cast(v)) + { + } + +#ifdef __HIP__MI300__ + // upcast using device specific intrinsic + explicit inline HIP_FP8_DEVICE operator float() const + { + float fval; + uint32_t i32val = static_cast(data); + + // upcast + asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val)); + + return fval; + } + + explicit inline HIP_FP8_HOST operator float() const +#else // __HIP__MI300__ + explicit inline HIP_FP8_HOST_DEVICE operator float() const +#endif // __HIP__MI300__ + { + return hip_fp8_impl::from_float8<4, 3, float, true /*negative_zero_nan*/>(data); + } +}; + +namespace std +{ +inline hip_fp8 sin(hip_fp8 a) +{ + return hip_fp8(sinf(float(a))); +} +inline hip_fp8 cos(hip_fp8 a) +{ + return hip_fp8(cosf(float(a))); +} +HIP_FP8_HOST_DEVICE constexpr hip_fp8 real(const hip_fp8& a) +{ + return a; +} +} // namespace std + +// Special operator overloading +inline std::ostream& operator<<(std::ostream& os, const hip_fp8& f8) +{ + return os << float(f8); +} + +// all + operator overloading with mixed types +// mixed types, always converts to f32, does computation in f32, and returns float +inline HIP_FP8_HOST_DEVICE float operator+(const float fa, hip_fp8 b) +{ + return (fa + float(b)); +} + +inline HIP_FP8_HOST_DEVICE float operator+(hip_fp8 a, const float fb) +{ + return (float(a) + fb); +} + +inline HIP_FP8_HOST_DEVICE hip_fp8 operator+(hip_fp8 a, hip_fp8 b) 
+{ + return hip_fp8(float(a) + float(b)); +} + +inline HIP_FP8_HOST_DEVICE hip_fp8& operator+=(hip_fp8& a, hip_fp8 b) +{ + return a = hip_fp8(float(a) + float(b)); +} + +// overloading multiplication, always returns float, +inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, hip_fp8 b) +{ + return float(a) * float(b); +} + +inline HIP_FP8_HOST_DEVICE float operator*(float a, hip_fp8 b) +{ + return (a * float(b)); +} + +inline HIP_FP8_HOST_DEVICE float operator*(hip_fp8 a, float b) +{ + return (float(a) * b); +} + +inline HIP_FP8_HOST_DEVICE float operator*(int32_t a, hip_fp8 b) +{ + return ((float)a * float(b)); +} + +inline HIP_FP8_HOST_DEVICE float operator*(double a, hip_fp8 b) +{ + return ((float)a * float(b)); +} + +// overloading for compare +inline HIP_FP8_HOST_DEVICE bool operator==(hip_fp8 a, hip_fp8 b) +{ + return (a.data == b.data); +} +inline HIP_FP8_HOST_DEVICE bool operator!=(hip_fp8 a, hip_fp8 b) +{ + return (a.data != b.data); +} + +inline HIP_FP8_HOST_DEVICE bool operator>=(hip_fp8 a, hip_fp8 b) +{ + return static_cast(a) >= static_cast(b); +} +inline HIP_FP8_HOST_DEVICE bool operator>(hip_fp8 a, hip_fp8 b) +{ + return static_cast(a) > static_cast(b); +} diff --git a/csrc/quantization/fp8/amd_detail/hip_float8_impl.h b/csrc/quantization/fp8/amd_detail/hip_float8_impl.h new file mode 100644 index 0000000000000..c88fbd913c2ee --- /dev/null +++ b/csrc/quantization/fp8/amd_detail/hip_float8_impl.h @@ -0,0 +1,316 @@ +#pragma once + +#if defined(__HIPCC__) && (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) +#define __HIP__MI300__ +#endif + +#ifdef __HIPCC__ +#define HIP_FP8_HOST_DEVICE __host__ __device__ +#define HIP_FP8_HOST __host__ +#define HIP_FP8_DEVICE __device__ +#else +#define HIP_FP8_HOST_DEVICE +#define HIP_FP8_HOST +#define HIP_FP8_DEVICE +#endif + +namespace hip_fp8_impl +{ + +#ifdef __HIP__MI300__ +HIP_FP8_DEVICE uint8_t to_fp8_from_fp32(float v) +{ + uint8_t i8data; + union { + float fval; + uint32_t i32val; + uint8_t 
i8val[4]; // NOTE: not endian independent + } val; + + uint32_t ival = 0; + val.fval = v; + + if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + + ival = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, + false); // false -> WORD0 + val.i32val = ival; + i8data = val.i8val[0]; + + return i8data; +} +#endif // __HIP__MI300__ + +HIP_FP8_HOST inline int clz(uint32_t x) +{ + return __builtin_clz(x); +} +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) +HIP_FP8_DEVICE inline int clz(uint32_t x) +{ + return __clz(x); +} +#endif + +template +HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false, uint32_t rng = 0) +{ +#ifdef __HIPCC__ + constexpr bool is_half = std::is_same::value; +#else + constexpr bool is_half = false; +#endif + constexpr bool is_float = std::is_same::value; + static_assert(wm + we == 7, "wm+we==7"); + static_assert(is_half || is_float, "Only half and float can be cast to f8"); + + const int mfmt = (sizeof(T) == 4) ? 23 : 10; + uint32_t x; + if (sizeof(T) == 4) { + x = reinterpret_cast(_x); + } else { + x = reinterpret_cast(_x); + } + + uint32_t head, mantissa; + int exponent, bias; + uint32_t sign; + + if (sizeof(T) == 4) { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + } else { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + } + + uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm); + + // Deal with inf and NaNs + if (negative_zero_nan) { + if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) { + return 0x80; + } + } else { + // if(__hisinf(x) || __hisnan(x)) + if ((x & 0x7C00) == 0x7C00) { + return 0x80; + } + } + } else { + if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) { + return signed_inf + (mantissa != 0 ? 
1 : 0); + } + } else { + if ((x & 0x7C00) == 0x7C00) { + return signed_inf + (mantissa != 0 ? 1 : 0); + } + } + } + if (x == 0) { + return 0; + } + + // First need to check if it is normal or denorm as there is a difference of + // implict 1 Then need to adjust the exponent to align with the F8 exponent, + // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng + // to mantissa and truncate. And for RNE, no need to add rng. Then probably + // need to check whether there is carry and adjust exponent and mantissa again + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent + // bits + const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0); + const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal + // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + // f8_exponent is the converted f8 exponent with bias encoding + // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + // the difference needs to be adjusted and mantissa shifted + int act_exponent, f8_exponent, exponent_diff; + + if (exponent == 0) { // fp32/fp16 is in denormal. + /* fp32 denormal is below 2^-127 so it is usually not a concern here, we +mostly concern fp16 here. In this case, f8 is usually in denormal. But there +could be exceptions. fp16 denormal has exponent bias 15 while bf8 with NANOO has +exponent bias 16. It means that there are some numbers in fp16 denormal but they +are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers +where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 +(NANOO) normal. 
In this case, the fp16 mantissa should be shift left by 1 */ + act_exponent = exponent - bias + 1; + exponent_diff = f8_denormal_act_exponent - act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } else { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if (act_exponent <= f8_denormal_act_exponent) { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal + range. For example fp8 nanoo mode, denormal exponent is -7, but if the + fp32/fp16 actual exponent is -7, it is actually larger due to the implict 1, + Therefore it needs to be adjust to -6 and mantissa shift right by 1. + So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } else { // both fp32/fp16 and f8 are in normal range + exponent_diff = 0; // exponent_diff=0 does not mean there is no difference + // for this case, + // act_exponent could be larger. Just that it does not need shift mantissa + } + mantissa += (1 << mfmt); // Add the implicit 1 into mantissa + } + + bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) == + static_cast(1 << (mfmt - wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be + done before we shift right as shift right could rip off some residual part + and make something not midpoint look like midpoint. For example, the fp16 + number 0x1002 (0 00100 0000000010), it is larger than midpoint, but after + shift right by 4 bits, it would look like midpoint. +*/ + + if (exponent_diff > 0) { + mantissa >>= exponent_diff; + } else if (exponent_diff == -1) { + mantissa <<= -exponent_diff; + } + bool implicit_one = mantissa & (1 << mfmt); + // if there is no implict 1, it means the f8 is denormal and need to adjust + // to denorm exponent + f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 
0 : 1); + + // Now we have the exponent and mantissa adjusted + uint32_t drop_mask = (1 << (mfmt - wm)) - 1; + bool odd = mantissa & (1 << (mfmt - wm)); // if the least significant bit that + // is not truncated is 1 + mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask; + + // Now we deal with overflow + if (f8_exponent == 0) { + if ((1 << mfmt) & mantissa) { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } else { + if ((1 << (mfmt + 1)) & mantissa) { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); + if (f8_exponent > max_exp) { + if (clip) { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } else { + return signed_inf; + } + } + + if (f8_exponent == 0 && mantissa == 0) { + return negative_zero_nan ? 0 : (sign << 7); + } + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +template +inline HIP_FP8_HOST_DEVICE T from_float8(uint8_t x) +{ +#ifdef __HIPCC__ + constexpr bool is_half = std::is_same::value; +#else + constexpr bool is_half = false; +#endif + constexpr bool is_float = std::is_same::value; + static_assert(is_half || is_float, "only half and float are supported"); + + constexpr int weo = is_half ? 5 : 8; + constexpr int wmo = is_half ? 10 : (is_float ? 
23 : 7); + + T fInf, fNegInf, fNaN, fNeg0; + +#ifdef __HIPCC__ + if (is_half) { + const uint16_t ihInf = 0x7C00; + const uint16_t ihNegInf = 0xFC00; + const uint16_t ihNaN = 0x7C01; + const uint16_t ihNeg0 = 0x8000; + fInf = reinterpret_cast(ihInf); + fNegInf = reinterpret_cast(ihNegInf); + fNaN = reinterpret_cast(ihNaN); + fNeg0 = reinterpret_cast(ihNeg0); + } else +#endif + if (is_float) { + const uint32_t ifInf = 0x7F800000; + const uint32_t ifNegInf = 0xFF800000; + const uint32_t ifNaN = 0x7F800001; + const uint32_t ifNeg0 = 0x80000000; + fInf = reinterpret_cast(ifInf); + fNegInf = reinterpret_cast(ifNegInf); + fNaN = reinterpret_cast(ifNaN); + fNeg0 = reinterpret_cast(ifNeg0); + } + + if (x == 0) { + return 0; + } + + uint32_t sign = x >> 7; + uint32_t mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if (negative_zero_nan) { + if (x == 0x80) { + return fNaN; + } + } else { + if (x == 0x80) { + return fNeg0; + } + if (exponent == ((1 << we) - 1)) { + return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + } + typename std::conditional::type retval; + if (we == 5 && is_half && !negative_zero_nan) { + retval = x << 8; + return reinterpret_cast(retval); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0); + + // subnormal input + if (exponent == 0) { + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + clz(mantissa) - (32 - wm); + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1 << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if (exponent <= 0) { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if (sizeof(T) == 2) { + retval = (sign << 15) | (exponent << 10) | mantissa; + } else { + retval = (sign << 31) | (exponent << 23) | mantissa; + } + return reinterpret_cast(retval); +} + +} // namespace hip_fp8_impl diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh new file mode 100644 index 0000000000000..afd37cc9da0d3 --- /dev/null +++ b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -0,0 +1,293 @@ +#pragma once +#include "hip_float8.h" + +#include +#include +#include + +#include "../../../attention/dtype_float32.cuh" +#include "../../../attention/dtype_bfloat16.cuh" + +namespace vllm +{ + +template +__inline__ __device__ Tout vec_conversion(const Tin& x) +{ + return x; +} + +// fp8 -> half +template <> +__inline__ __device__ uint16_t vec_conversion(const uint8_t& a) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + __half_raw res; + res.data = static_cast(f8); + return res.x; +} + +// fp8x2 -> half2 +template <> +__inline__ __device__ uint32_t vec_conversion(const uint16_t& a) +{ +#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + union { + __half2_raw h2r; + uint32_t ui32; + } tmp; + tmp.h2r.x.data = f2[0]; + tmp.h2r.y.data = f2[1]; + return tmp.ui32; +#else + union { + uint16_t u16[2]; + uint32_t u32; + } tmp; + + tmp.u16[0] = vec_conversion(static_cast(a)); + tmp.u16[1] = vec_conversion(static_cast(a >> 8U)); + return tmp.u32; +#endif +} + +// 
fp8x4 -> half2x2 +template <> +__inline__ __device__ uint2 vec_conversion(const uint32_t& a) +{ + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = vec_conversion((uint16_t)a); + tmp.u32[1] = vec_conversion((uint16_t)(a >> 16U)); + return tmp.u32x2; +} + +// fp8x8 -> half2x4 +template <> +__inline__ __device__ uint4 vec_conversion(const uint2& a) +{ + union { + uint4 u64x2; + uint2 u64[2]; + } tmp; + tmp.u64[0] = vec_conversion(a.x); + tmp.u64[1] = vec_conversion(a.y); + return tmp.u64x2; +} + +using __nv_bfloat16 = __hip_bfloat16; + +// fp8 -> __nv_bfloat16 +template <> +__inline__ __device__ __nv_bfloat16 vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + float f{f8}; + return __float2bfloat16(f); +} + +using __nv_bfloat162 = __hip_bfloat162; + +// fp8x2 -> __nv_bfloat162 +template <> +__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a) +{ + __nv_bfloat162 res; + res.x = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a); + res.y = vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U)); + return res; +} + +// fp8x4 -> bf16_4_t +template <> +__inline__ __device__ bf16_4_t vec_conversion(const uint32_t& a) +{ + bf16_4_t res; + res.x = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a); + res.y = vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U)); + return res; +} + +// fp8x8 -> bf16_8_t +template <> +__inline__ __device__ bf16_8_t vec_conversion(const uint2& a) +{ + bf16_4_t tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// fp8 -> float +template <> +__inline__ __device__ float vec_conversion(const uint8_t& a) +{ + hip_fp8 fp8{a, hip_fp8::from_bits()}; + return static_cast(fp8); +} + +// fp8x2 -> float2 +template <> +__inline__ __device__ float2 vec_conversion(const uint16_t& a) +{ +#if 
defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + float2 res; + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + //res.x = vec_conversion(static_cast(a)); + //res.y = vec_conversion(static_cast(a >> 8U)); + res.x = f2[0]; + res.y = f2[1]; + return res; +#else + float2 res; + res.x = vec_conversion(static_cast(a)); + res.y = vec_conversion(static_cast(a >> 8U)); + return res; +#endif +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ Float4_ vec_conversion(const uint32_t& a) +{ + Float4_ res; + res.x = vec_conversion((uint16_t)a); + res.y = vec_conversion((uint16_t)(a >> 16U)); + return res; +} + +// fp8x8 -> float8 +template <> +__inline__ __device__ Float8_ vec_conversion(const uint2& a) +{ + Float4_ tmp1, tmp2; + tmp1 = vec_conversion(a.x); + tmp2 = vec_conversion(a.y); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// half -> fp8 +template <> +__inline__ __device__ uint8_t vec_conversion(const uint16_t& a) +{ + __half_raw tmp; + tmp.x = a; + + hip_fp8 f8{static_cast(tmp.data)}; + return f8.data; +} + +// bf16 -> fp8 +template <> +__inline__ __device__ uint8_t vec_conversion(const __nv_bfloat16& a) +{ + hip_fp8 res{__bfloat162float(a)}; + return res.data; +} + +// float -> fp8 +template <> +__inline__ __device__ uint8_t vec_conversion(const float& a) +{ + hip_fp8 f8(a); + return f8.data; +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ float4 vec_conversion(const uint32_t& a) +{ + Float4_ tmp = vec_conversion(a); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; +} + +template <> +__inline__ __device__ uint32_t vec_conversion(const float2& a) +{ + union { + half2 float16; + uint32_t uint32; + }; + + float16 = __float22half2_rn(a); + return uint32; +} + +template <> +__inline__ __device__ uint2 vec_conversion(const Float4_& a) +{ + uint2 b; + float2 val; + val.x = a.x.x; + val.y = a.x.y; + b.x = vec_conversion(val); + + 
val.x = a.y.x; + val.y = a.y.y; + b.y = vec_conversion(val); + return b; +} + +template <> +__inline__ __device__ float4 vec_conversion(const Float4_& a) +{ + float4 b; + b.x = a.x.x; + b.y = a.x.y; + b.z = a.y.x; + b.w = a.y.y; + return b; +} + +template <> +__inline__ __device__ uint4 vec_conversion(const Float8_& a) +{ + uint4 b; + b.x = vec_conversion(a.x); + b.y = vec_conversion(a.y); + b.z = vec_conversion(a.z); + b.w = vec_conversion(a.w); + return b; +} + +template <> +__inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a) +{ + __nv_bfloat162 b = __float22bfloat162_rn(a); + return b; +} + +template <> +__inline__ __device__ bf16_4_t vec_conversion(const Float4_& a) +{ + bf16_4_t b; + b.x = __float22bfloat162_rn(a.x); + b.y = __float22bfloat162_rn(a.y); + return b; +} + +template <> +__inline__ __device__ bf16_8_t vec_conversion(const Float8_& a) +{ + bf16_8_t b; + b.x = __float22bfloat162_rn(a.x); + b.y = __float22bfloat162_rn(a.y); + b.z = __float22bfloat162_rn(a.z); + b.w = __float22bfloat162_rn(a.w); + return b; +} +} // namespace vllm From 4bb8dacf4caea8cfa1fcaa6cf16b2af9de8988ff Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Mon, 5 Feb 2024 17:03:24 -0800 Subject: [PATCH 066/159] Add 3rdparty quantizer utility and usage to quantize models (HF default) --- 3rdparty/README.md | 8 ++ 3rdparty/quantizer/hf_ptq.py | 211 +++++++++++++++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 3rdparty/README.md create mode 100644 3rdparty/quantizer/hf_ptq.py diff --git a/3rdparty/README.md b/3rdparty/README.md new file mode 100644 index 0000000000000..afa2a9d4c7657 --- /dev/null +++ b/3rdparty/README.md @@ -0,0 +1,8 @@ +### quantizer utilities +`quantizer/hf_ptq.py`: Quantization utilities from AMMO and/or TensorRT-LLM, usage embedded at top + +### AMMO (AlgorithMic Model Optimization) Installation +`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` + +### AMMO Download 
+`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` diff --git a/3rdparty/quantizer/hf_ptq.py b/3rdparty/quantizer/hf_ptq.py new file mode 100644 index 0000000000000..5597ba9991e49 --- /dev/null +++ b/3rdparty/quantizer/hf_ptq.py @@ -0,0 +1,211 @@ +# with AMMO installed, do below: +# python hf_ptq.py --pyt_ckpt_path="./ll2-7b" --export_path=ll2_7b_ptq_fp8 --qformat=fp8 --calib_size=128 --inference_gpus=1 +# python hf_ptq.py --pyt_ckpt_path= \ +# --export_path=llama_ptq \ +# --qformat=fp8 \ +# --calib_size=128 \ +# --inference_gpus=1 +# +# with TensorRT-LLM is installed, similarly do below: +# /dockerx/TensorRT-LLM/examples/quantization# python quantize.py --model_dir /dockerx/ll2-7b --dtype float16 --qformat fp8 --export_path /dockerx/ll2_7b_quantized_fp8 --calib_size 256 + +import argparse +import copy +import random +import time + +import numpy as np +import torch +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +import ammo.torch.quantization as atq +from ammo.torch.export import export_model_config + +RAND_SEED = 1234 +MAX_SEQ_LEN = 2048 + +QUANT_CFG_CHOICES = { + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, +} + +def get_calib_dataloader(data="cnn_dailymail", tokenizer=None, batch_size=1, calib_size=512, block_size=512, device=None): + print("Loading calibration dataset") + if data == "pileval": + dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") + dataset = dataset["text"][:calib_size] + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + dataset = dataset["article"][:calib_size] + else: + raise NotImplementedError + batch_encoded = tokenizer.batch_encode_plus( + dataset, return_tensors="pt", padding=True, truncation=True, max_length=block_size + ) + if device: + 
batch_encoded = batch_encoded.to(device) + batch_encoded = batch_encoded["input_ids"] + calib_dataloader = DataLoader(batch_encoded, batch_size=batch_size, shuffle=False) + return calib_dataloader + + +def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN): + print(f"Initializing tokenizer from {ckpt_path}") + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, + model_max_length=max_seq_len, + padding_side="left", + trust_remote_code=True, + ) + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def get_model(ckpt_path, dtype="fp16", device="cuda"): + print(f"Initializing model from {ckpt_path}") + if dtype == "bf16": + dtype = torch.bfloat16 + elif dtype == "fp16": + dtype = torch.float16 + elif dtype == "fp32": + dtype = torch.float32 + else: + raise NotImplementedError(f"Unknown dtype {dtype}") + model_kwargs = {"torch_dtype": dtype} + model = AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=True) + model.eval() + return model + + +def quantize_model(model, quant_cfg, calib_dataloader=None): + def calibrate_loop(): + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + print(f"Calibrating batch {idx}") + model(data) + + print("Starting quantization...") + start_time = time.time() + atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + end_time = time.time() + print(f"Quantization done. Total time used: {end_time - start_time}s") + return model + + +def _register_falcon_linears(model): + """Register Falcon linear modules as Quantiation. + + As falcon models could use remote code, which will be loaded dynamically, to build their model. + Therefore, we need to register the linear on the fly before quantization. 
+ + """ + if type(model).__name__ in ["RWForCausalLM", "FalconForCausalLM"]: + from ammo.torch.quantization import tensor_quant + from ammo.torch.quantization.nn.modules.quant_module import QuantLinearConvBase + + linear_type = type(model.transformer.h[0].self_attention.dense) + + class QuantFalconLinearRW1B(linear_type, QuantLinearConvBase): # type: ignore + default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW + + atq.module_mapping.QUANT_MODULE_MAPPING[linear_type] = QuantFalconLinearRW1B.convert + + + +def main(args): + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for inference.") + + random.seed(RAND_SEED) + np.random.seed(RAND_SEED) + + tokenizer = get_tokenizer(args.pyt_ckpt_path) + model = get_model(args.pyt_ckpt_path, args.dtype, args.device) + + _register_falcon_linears(model) + if args.qformat in ["fp8", "int8_sq", "int4_awq"]: + if args.qformat == "int4_awq": + if args.calib_size > 32: + calib_size = 32 + print( + f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using" + f" calib_size={calib_size} instead" + ) + print( + "\nAWQ calibration could take longer than other calibration methods. Please" + " increase the batch size to speed up the calibration process. 
Batch size can be" + " set by adding the argument --batch_size to the command line.\n" + ) + else: + calib_size = args.calib_size + + calib_dataloader = get_calib_dataloader( + tokenizer=tokenizer, + batch_size=args.batch_size, + calib_size=calib_size, + device=args.device, + ) + if args.qformat in QUANT_CFG_CHOICES: + quant_cfg = QUANT_CFG_CHOICES[args.qformat] + else: + raise ValueError(f"Unsupported quantization format: {args.qformat}") + + if args.qformat == "int4_awq": + quant_cfg = copy.deepcopy(atq.INT4_AWQ_CFG) + quant_cfg["quant_cfg"]["*weight_quantizer"]["block_sizes"][-1] = args.awq_block_size # type: ignore + + model = quantize_model(model, quant_cfg, calib_dataloader) + else: + print(f"No quantization applied, export {args.dtype} model") + + + with torch.inference_mode(): + if any([k in type(model).__name__ for k in ["Llama", "Mistral"]]): + model_type = "llama" + elif "GPTJ" in type(model).__name__: + model_type = "gptj" + elif type(model).__name__ in ["FalconForCausalLM", "RWForCausalLM"]: + model_type = "falcon" + elif "baichuan" in type(model).__name__.lower(): + model_type = "baichuan" + elif "MPT" in type(model).__name__: + model_type = "mpt" + else: + print(f"Unknown model type {type(model).__name__}. Continue exporting...") + model_type = f"unknown:{type(model).__name__}" + + export_path = args.export_path + start_time = time.time() + export_model_config( + model, + model_type, + torch.float16, + export_dir=export_path, + inference_tensor_parallel=int(args.inference_gpus), + ) + end_time = time.time() + print( + f"Quantized model exported to :{export_path}. 
Total time used {end_time - start_time}s" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--pyt_ckpt_path", help="Specify where the PyTorch checkpoint path is", required=True) + parser.add_argument("--device", default="cuda") + parser.add_argument("--dtype", help="Model data type.", default="fp16") + parser.add_argument("--qformat", help="Quantization format.", default="fp8") + parser.add_argument("--batch_size", help="Batch size for calibration.", type=int, default=1) + parser.add_argument("--calib_size", help="Number of samples for calibration.", type=int, default=512) + parser.add_argument("--export_path", default="exported_model") + parser.add_argument("--inference_gpus", default=1) + parser.add_argument("--awq_block_size", default=128) + + args = parser.parse_args() + + main(args) + + From 8b1279b9763c2c93e2812ddd88c098e26586db8a Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Tue, 6 Feb 2024 17:18:09 -0800 Subject: [PATCH 067/159] Update 3rdparty quantizer utility and usage with ammo updates --- 3rdparty/README.md | 32 ++- 3rdparty/quantizer/hf_ptq.py | 211 ------------------ 3rdparty/quantizer/quantize.py | 380 +++++++++++++++++++++++++++++++++ 3 files changed, 408 insertions(+), 215 deletions(-) delete mode 100644 3rdparty/quantizer/hf_ptq.py create mode 100644 3rdparty/quantizer/quantize.py diff --git a/3rdparty/README.md b/3rdparty/README.md index afa2a9d4c7657..1ad63fabf5bed 100644 --- a/3rdparty/README.md +++ b/3rdparty/README.md @@ -1,8 +1,32 @@ -### quantizer utilities -`quantizer/hf_ptq.py`: Quantization utilities from AMMO and/or TensorRT-LLM, usage embedded at top +### Quantizer Utilities +`quantizer/quantize.py`: nVIDIA Quantization utilities using AMMO, ported from TensorRT-LLM: +`https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py` -### AMMO (AlgorithMic Model Optimization) Installation +### Prerequisite + +#### AMMO (AlgorithMic Model Optimization) 
Installation: nvidia-ammo 0.7.1 or later `pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` -### AMMO Download +#### AMMO Download (code and docs) `https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` +`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` + +### Usage + +#### Run on H100 system for speed if FP8; number of GPUs depends on the model size + +#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: +`python quantize.py --model_dir ./ll2-7b --dtype float16 --qformat fp8 --kv_cache_dtype fp8 --output_dir ./ll2_7b_fp8 --calib_size 512 --tp_size 1` + +Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) +``` +# ll ./ll2_7b_fp8/ +total 19998244 +drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ +drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ +-rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json +-rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz +-rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors +# +``` + diff --git a/3rdparty/quantizer/hf_ptq.py b/3rdparty/quantizer/hf_ptq.py deleted file mode 100644 index 5597ba9991e49..0000000000000 --- a/3rdparty/quantizer/hf_ptq.py +++ /dev/null @@ -1,211 +0,0 @@ -# with AMMO installed, do below: -# python hf_ptq.py --pyt_ckpt_path="./ll2-7b" --export_path=ll2_7b_ptq_fp8 --qformat=fp8 --calib_size=128 --inference_gpus=1 -# python hf_ptq.py --pyt_ckpt_path= \ -# --export_path=llama_ptq \ -# --qformat=fp8 \ -# --calib_size=128 \ -# --inference_gpus=1 -# -# with TensorRT-LLM is installed, similarly do below: -# /dockerx/TensorRT-LLM/examples/quantization# python quantize.py --model_dir /dockerx/ll2-7b --dtype float16 --qformat fp8 --export_path /dockerx/ll2_7b_quantized_fp8 --calib_size 256 - -import argparse -import copy -import random -import time - -import numpy as np 
-import torch -from datasets import load_dataset -from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoTokenizer - -import ammo.torch.quantization as atq -from ammo.torch.export import export_model_config - -RAND_SEED = 1234 -MAX_SEQ_LEN = 2048 - -QUANT_CFG_CHOICES = { - "int8_sq": atq.INT8_SMOOTHQUANT_CFG, - "fp8": atq.FP8_DEFAULT_CFG, - "int4_awq": atq.INT4_AWQ_CFG, -} - -def get_calib_dataloader(data="cnn_dailymail", tokenizer=None, batch_size=1, calib_size=512, block_size=512, device=None): - print("Loading calibration dataset") - if data == "pileval": - dataset = load_dataset("json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train") - dataset = dataset["text"][:calib_size] - elif data == "cnn_dailymail": - dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") - dataset = dataset["article"][:calib_size] - else: - raise NotImplementedError - batch_encoded = tokenizer.batch_encode_plus( - dataset, return_tensors="pt", padding=True, truncation=True, max_length=block_size - ) - if device: - batch_encoded = batch_encoded.to(device) - batch_encoded = batch_encoded["input_ids"] - calib_dataloader = DataLoader(batch_encoded, batch_size=batch_size, shuffle=False) - return calib_dataloader - - -def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN): - print(f"Initializing tokenizer from {ckpt_path}") - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, - model_max_length=max_seq_len, - padding_side="left", - trust_remote_code=True, - ) - tokenizer.pad_token = tokenizer.eos_token - return tokenizer - - -def get_model(ckpt_path, dtype="fp16", device="cuda"): - print(f"Initializing model from {ckpt_path}") - if dtype == "bf16": - dtype = torch.bfloat16 - elif dtype == "fp16": - dtype = torch.float16 - elif dtype == "fp32": - dtype = torch.float32 - else: - raise NotImplementedError(f"Unknown dtype {dtype}") - model_kwargs = {"torch_dtype": dtype} - model = 
AutoModelForCausalLM.from_pretrained(ckpt_path, device_map="auto", **model_kwargs, trust_remote_code=True) - model.eval() - return model - - -def quantize_model(model, quant_cfg, calib_dataloader=None): - def calibrate_loop(): - """Adjusts weights and scaling factors based on selected algorithms.""" - for idx, data in enumerate(calib_dataloader): - print(f"Calibrating batch {idx}") - model(data) - - print("Starting quantization...") - start_time = time.time() - atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - end_time = time.time() - print(f"Quantization done. Total time used: {end_time - start_time}s") - return model - - -def _register_falcon_linears(model): - """Register Falcon linear modules as Quantiation. - - As falcon models could use remote code, which will be loaded dynamically, to build their model. - Therefore, we need to register the linear on the fly before quantization. - - """ - if type(model).__name__ in ["RWForCausalLM", "FalconForCausalLM"]: - from ammo.torch.quantization import tensor_quant - from ammo.torch.quantization.nn.modules.quant_module import QuantLinearConvBase - - linear_type = type(model.transformer.h[0].self_attention.dense) - - class QuantFalconLinearRW1B(linear_type, QuantLinearConvBase): # type: ignore - default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW - - atq.module_mapping.QUANT_MODULE_MAPPING[linear_type] = QuantFalconLinearRW1B.convert - - - -def main(args): - if not torch.cuda.is_available(): - raise EnvironmentError("GPU is required for inference.") - - random.seed(RAND_SEED) - np.random.seed(RAND_SEED) - - tokenizer = get_tokenizer(args.pyt_ckpt_path) - model = get_model(args.pyt_ckpt_path, args.dtype, args.device) - - _register_falcon_linears(model) - if args.qformat in ["fp8", "int8_sq", "int4_awq"]: - if args.qformat == "int4_awq": - if args.calib_size > 32: - calib_size = 32 - print( - f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using" - f" 
calib_size={calib_size} instead" - ) - print( - "\nAWQ calibration could take longer than other calibration methods. Please" - " increase the batch size to speed up the calibration process. Batch size can be" - " set by adding the argument --batch_size to the command line.\n" - ) - else: - calib_size = args.calib_size - - calib_dataloader = get_calib_dataloader( - tokenizer=tokenizer, - batch_size=args.batch_size, - calib_size=calib_size, - device=args.device, - ) - if args.qformat in QUANT_CFG_CHOICES: - quant_cfg = QUANT_CFG_CHOICES[args.qformat] - else: - raise ValueError(f"Unsupported quantization format: {args.qformat}") - - if args.qformat == "int4_awq": - quant_cfg = copy.deepcopy(atq.INT4_AWQ_CFG) - quant_cfg["quant_cfg"]["*weight_quantizer"]["block_sizes"][-1] = args.awq_block_size # type: ignore - - model = quantize_model(model, quant_cfg, calib_dataloader) - else: - print(f"No quantization applied, export {args.dtype} model") - - - with torch.inference_mode(): - if any([k in type(model).__name__ for k in ["Llama", "Mistral"]]): - model_type = "llama" - elif "GPTJ" in type(model).__name__: - model_type = "gptj" - elif type(model).__name__ in ["FalconForCausalLM", "RWForCausalLM"]: - model_type = "falcon" - elif "baichuan" in type(model).__name__.lower(): - model_type = "baichuan" - elif "MPT" in type(model).__name__: - model_type = "mpt" - else: - print(f"Unknown model type {type(model).__name__}. Continue exporting...") - model_type = f"unknown:{type(model).__name__}" - - export_path = args.export_path - start_time = time.time() - export_model_config( - model, - model_type, - torch.float16, - export_dir=export_path, - inference_tensor_parallel=int(args.inference_gpus), - ) - end_time = time.time() - print( - f"Quantized model exported to :{export_path}. 
Total time used {end_time - start_time}s" - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--pyt_ckpt_path", help="Specify where the PyTorch checkpoint path is", required=True) - parser.add_argument("--device", default="cuda") - parser.add_argument("--dtype", help="Model data type.", default="fp16") - parser.add_argument("--qformat", help="Quantization format.", default="fp8") - parser.add_argument("--batch_size", help="Batch size for calibration.", type=int, default=1) - parser.add_argument("--calib_size", help="Number of samples for calibration.", type=int, default=512) - parser.add_argument("--export_path", default="exported_model") - parser.add_argument("--inference_gpus", default=1) - parser.add_argument("--awq_block_size", default=128) - - args = parser.parse_args() - - main(args) - - diff --git a/3rdparty/quantizer/quantize.py b/3rdparty/quantizer/quantize.py new file mode 100644 index 0000000000000..a68f21a89c65d --- /dev/null +++ b/3rdparty/quantizer/quantize.py @@ -0,0 +1,380 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Adapted from examples/quantization/hf_ptq.py +""" + +import argparse +import copy +import json +import random +import time + +import ammo.torch.quantization as atq +import numpy as np +import torch +from ammo.torch.export import export_model_config +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +RAND_SEED = 1234 +MAX_SEQ_LEN = 2048 + +EMPTY_CFG = { + "quant_cfg": { + "*weight_quantizer": { + "enable": False, + }, + "*input_quantizer": { + "enable": False + }, + "*lm_head*": { + "enable": False + }, + "*output_layer*": { + "enable": False + }, + "default": { + "enable": False + }, + }, + "algorithm": "max", +} + +KV_CACHE_CFG = { + "*.query_key_value.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.Wqkv.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.W_pack.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.c_attn.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.k_proj.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, + "*.v_proj.output_quantizer": { + "num_bits": 8, + "axis": None, + "enable": True + }, +} + +QUANT_CFG_CHOICES = { + "int8_sq": atq.INT8_SMOOTHQUANT_CFG, + "fp8": atq.FP8_DEFAULT_CFG, + "int4_awq": atq.INT4_AWQ_CFG, + "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, + "int8_wo": EMPTY_CFG, + "int4_wo": EMPTY_CFG, + "full_prec": EMPTY_CFG, +} + +MODEL_NAME_PATTERN_MAP = { + "GPT2": "gpt2", + "Xverse": "llama", + "Llama": "llama", + "Mistral": "llama", + "GPTJ": "gptj", + "FalconForCausalLM": "falcon", + "RWForCausalLM": "falcon", + "baichuan": "baichuan", + "MPT": "mpt", + "Bloom": "bloom", + "ChatGLM": "chatglm", + "QWen": "qwen", +} + + +def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None): + print(f"Initializing tokenizer from {ckpt_path}") + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, 
+ model_max_length=max_seq_len, + padding_side="left", + trust_remote_code=True, + ) + if model_type and model_type == "qwen": + # qwen use token id 151643 as pad and eos tokens + tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643) + tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643) + + # can't set attribute 'pad_token' for "" + if tokenizer.pad_token != "": + tokenizer.pad_token = tokenizer.eos_token + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + assert tokenizer.pad_token is not None, f"Pad token for {model_type} cannot be set!" + + return tokenizer + + +def get_model(ckpt_path, dtype="fp16", device="cuda"): + print(f"Initializing model from {ckpt_path}") + if dtype == "bf16" or dtype == "bfloat16": + dtype = torch.bfloat16 + elif dtype == "fp16" or dtype == "float16": + dtype = torch.float16 + elif dtype == "fp32" or dtype == "float32": + dtype = torch.float32 + else: + raise NotImplementedError(f"Unknown dtype {dtype}") + + # model_kwargs = {"torch_dtype": dtype} + model_kwargs = {"torch_dtype": "auto"} + + model = AutoModelForCausalLM.from_pretrained(ckpt_path, + device_map="auto", + **model_kwargs, + trust_remote_code=True) + model.eval() + + model_dtype = next(model.parameters()).dtype + if dtype != model_dtype: + print( + f"[TensorRT-LLM][WARNING] The manually set model data type is {dtype}, " + f"but the data type of the HuggingFace model is {model_dtype}.") + + return model + + +def get_model_type(model): + for k, v in MODEL_NAME_PATTERN_MAP.items(): + if k.lower() in type(model).__name__.lower(): + return v + return None + + +def get_calib_dataloader(data="cnn_dailymail", + tokenizer=None, + batch_size=1, + calib_size=512, + block_size=512, + device=None): + print("Loading calibration dataset") + if data == "pileval": + dataset = load_dataset( + "json", + data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", + split="train") + dataset = dataset["text"][:calib_size] + elif data == 
"cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + dataset = dataset["article"][:calib_size] + else: + raise NotImplementedError + + batch_encoded = tokenizer.batch_encode_plus(dataset, + return_tensors="pt", + padding=True, + truncation=True, + max_length=block_size) + if device: + batch_encoded = batch_encoded.to(device) + batch_encoded = batch_encoded["input_ids"] + + calib_dataloader = DataLoader(batch_encoded, + batch_size=batch_size, + shuffle=False) + + return calib_dataloader + + +def quantize_model(model, quant_cfg, calib_dataloader=None): + + def calibrate_loop(): + if calib_dataloader is None: + return + """Adjusts weights and scaling factors based on selected algorithms.""" + for idx, data in enumerate(calib_dataloader): + print(f"Calibrating batch {idx}") + model(data) + + print("Starting quantization...") + start_time = time.time() + atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) + end_time = time.time() + print("Quantization done. Total time used: {:.2f} s.".format(end_time - + start_time)) + + return model + + +def main(args): + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for inference.") + + random.seed(RAND_SEED) + np.random.seed(RAND_SEED) + + model = get_model(args.model_dir, args.dtype, args.device) + model_type = get_model_type(model) + tokenizer = get_tokenizer(args.model_dir, model_type=model_type) + + if args.qformat in ["full_prec", "int8_wo", "int4_wo" + ] and args.kv_cache_dtype is None: + print(f"No quantization applied, export {args.dtype} model") + else: + if "awq" in args.qformat: + if args.calib_size > 32: + print( + f"AWQ calibration could take longer with calib_size = {args.calib_size}, Using" + " calib_size=32 instead") + args.calib_size = 32 + print( + "\nAWQ calibration could take longer than other calibration methods. Please" + " increase the batch size to speed up the calibration process. 
Batch size can be" + " set by adding the argument --batch_size to the command line.\n" + ) + + calib_dataloader = get_calib_dataloader( + tokenizer=tokenizer, + batch_size=args.batch_size, + calib_size=args.calib_size, + device=args.device, + ) + + if args.qformat in QUANT_CFG_CHOICES: + quant_cfg = QUANT_CFG_CHOICES[args.qformat] + else: + raise ValueError(f"Unsupported quantization format: {args.qformat}") + + if "awq" in args.qformat: + quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat]) + weight_quantizer = quant_cfg["quant_cfg"][ + "*weight_quantizer"] # type: ignore + if isinstance(weight_quantizer, list): + weight_quantizer = weight_quantizer[0] + weight_quantizer["block_sizes"][-1] = args.awq_block_size + + if args.kv_cache_dtype is not None: + if args.kv_cache_dtype == "fp8": + for value in KV_CACHE_CFG.values(): + value.update({"num_bits": (4, 3)}) # type: ignore + quant_cfg["quant_cfg"].update(KV_CACHE_CFG) # type: ignore + + print(quant_cfg) + + model = quantize_model(model, quant_cfg, calib_dataloader) + + with torch.inference_mode(): + if model_type is None: + print( + f"Unknown model type {type(model).__name__}. Continue exporting..." 
+ ) + model_type = f"unknown:{type(model).__name__}" + + export_path = args.output_dir + start_time = time.time() + + if args.qformat == "int4_awq" and model_type == "qwen": + torch.save(model.state_dict(), export_path) + else: + export_npz = (model_type not in [ + 'gptj', 'falcon', 'chatglm', 'mpt', 'llama', 'baichuan' + ]) + + # export safetensors + export_model_config(model, + model_type, + getattr(torch, args.dtype), + export_dir=export_path, + inference_tensor_parallel=args.tp_size, + inference_pipeline_parallel=args.pp_size, + # export_tensorrt_llm_config=(not export_npz), + export_tensorrt_llm_config=False, + export_npz=export_npz) + + # export npz (reference) + export_model_config(model, + model_type, + getattr(torch, args.dtype), + export_dir=export_path, + inference_tensor_parallel=args.tp_size, + inference_pipeline_parallel=args.pp_size, + # export_tensorrt_llm_config=(not export_npz), + export_tensorrt_llm_config=False, + # export_npz=export_npz, + export_npz=True) + + # Workaround for wo quantization + if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: + with open(f"{export_path}/config.json", 'r') as f: + tensorrt_llm_config = json.load(f) + if args.qformat == "int8_wo": + tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' + elif args.qformat == "int4_wo": + tensorrt_llm_config["quantization"]["quant_algo"] = 'W4A16' + else: + tensorrt_llm_config["quantization"]["quant_algo"] = None + with open(f"{export_path}/config.json", "w") as f: + json.dump(tensorrt_llm_config, f, indent=4) + + end_time = time.time() + print( + "Quantized model exported to {} \nTotal time used {:.2f} s.".format( + export_path, end_time - start_time)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--model_dir", + help="Specify where the HuggingFace model is", + required=True) + parser.add_argument("--device", default="cuda") + parser.add_argument("--dtype", help="Model data type.", default="float16") + 
parser.add_argument( + "--qformat", + help="Quantization format.", + default="full_prec", + choices=[ + "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", + "full_prec" + ], + ) + parser.add_argument("--batch_size", + help="Batch size for calibration.", + type=int, + default=1) + parser.add_argument("--calib_size", + help="Number of samples for calibration.", + type=int, + default=512) + parser.add_argument("--output_dir", default="exported_model") + parser.add_argument("--tp_size", type=int, default=1) + parser.add_argument("--pp_size", type=int, default=1) + parser.add_argument("--awq_block_size", type=int, default=128) + parser.add_argument("--kv_cache_dtype", + help="KV Cache dtype.", + default=None, + choices=["int8", "fp8", None]) + args = parser.parse_args() + + main(args) From 9c4226e0e769c06fa2debb9ae3b2b2696d1fe237 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 7 Feb 2024 13:51:56 -0800 Subject: [PATCH 068/159] Use e4m3 and e5m2 interchangeably --- csrc/attention/dtype_fp8_e5m2.cuh | 4 ++-- csrc/cache_kernels.cu | 19 +++++++++++++++---- .../fp8/amd_detail/quant_utils.cuh | 3 ++- tests/kernels/test_cache.py | 12 +++++++----- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/csrc/attention/dtype_fp8_e5m2.cuh b/csrc/attention/dtype_fp8_e5m2.cuh index 0580fbb8e863f..7d9e15e24237d 100644 --- a/csrc/attention/dtype_fp8_e5m2.cuh +++ b/csrc/attention/dtype_fp8_e5m2.cuh @@ -8,7 +8,7 @@ #endif namespace vllm { -#ifdef ENABLE_FP8_E5M2 +//#ifdef ENABLE_FP8_E5M2 // fp8 vector types for quantization of kv cache template<> @@ -30,6 +30,6 @@ template<> struct Vec { using Type = uint2; }; -#endif // ENABLE_FP8_E5M2 +//#endif // ENABLE_FP8_E5M2 } // namespace vllm diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 7254010b8e3a9..e993fa9031dba 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -6,6 +6,8 @@ #include "dispatch_utils.h" #ifdef ENABLE_FP8_E5M2 #include 
"quantization/fp8_e5m2_kvcache/quant_utils.cuh" +#else +#include "quantization/fp8/amd_detail/quant_utils.cuh" #endif #include @@ -200,7 +202,8 @@ __global__ void reshape_and_cache_kernel( key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); #else - assert(false); + key_cache[tgt_key_idx] = fp8_e4m3::vec_conversion(tgt_key); + value_cache[tgt_value_idx] = fp8_e4m3::vec_conversion(tgt_value); #endif } else { key_cache[tgt_key_idx] = tgt_key; @@ -277,10 +280,10 @@ __global__ void convert_fp8_e5m2_kernel( const int64_t block_idx = blockIdx.x; for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { int64_t idx = block_idx * block_stride + i; -#ifdef ENABLE_FP8_E5M2 + #ifdef ENABLE_FP8_E5M2 dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion(src_cache[idx]); #else - assert(false); + dst_cache[idx] = fp8_e4m3::vec_conversion(src_cache[idx]); #endif } } @@ -297,13 +300,21 @@ void convert_fp8_e5m2( torch::Tensor& src_cache, torch::Tensor& dst_cache) { + torch::Device src_device = src_cache.device(); + torch::Device dst_device = dst_cache.device(); + if (src_device.is_cuda() && dst_device.is_cuda()) { + TORCH_CHECK( + src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); + } + const at::cuda::OptionalCUDAGuard device_guard(device_of(src_cache)); int64_t num_blocks = src_cache.size(0); int64_t block_stride = src_cache.stride(0); dim3 grid(num_blocks); dim3 block(std::min(block_stride, int64_t(512))); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - + if (src_cache.dtype() == at::ScalarType::Float) { CALL_CONVERT_FP8_E5M2(uint8_t, float); } else if (src_cache.dtype() == at::ScalarType::Half) { diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh index afd37cc9da0d3..7bc70d9264ab8 100644 --- a/csrc/quantization/fp8/amd_detail/quant_utils.cuh +++ 
b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -10,7 +10,7 @@ namespace vllm { - +namespace fp8_e4m3 { template __inline__ __device__ Tout vec_conversion(const Tin& x) { @@ -290,4 +290,5 @@ __inline__ __device__ bf16_8_t vec_conversion(const Float8_& b.w = __float22bfloat162_rn(a.w); return b; } +} } // namespace vllm diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index d8dc74bc7b003..f70a35b1c9470 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -105,7 +105,8 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( kv_cache_factory, @@ -116,7 +117,8 @@ def test_reshape_and_cache( num_blocks: int, dtype: torch.dtype, seed: int, - device: str, + device: int, + kv_cache_dtype: str, ) -> None: random.seed(seed) torch.random.manual_seed(seed) @@ -133,8 +135,8 @@ def test_reshape_and_cache( # Create the KV caches. key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, dtype, - None, seed, device) + num_heads, head_size, kv_cache_dtype, + dtype, seed, gpu_id) key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. @@ -143,7 +145,7 @@ def test_reshape_and_cache( # Call the reshape_and_cache kernel. cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, "auto") + slot_mapping, kv_cache_dtype) # Run the reference implementation. 
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) From 30bba1ce5c84d4f4b6a07535cdbc9e500f66554c Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 7 Feb 2024 14:41:42 -0800 Subject: [PATCH 069/159] Using fp8 in any cache tests that could support it --- csrc/cache_kernels.cu | 8 +++++++- tests/kernels/test_cache.py | 14 ++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index e993fa9031dba..66c6245b0a920 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -307,7 +307,13 @@ void convert_fp8_e5m2( src_device.index() == dst_device.index(), "src and dst must be on the same GPU"); } - const at::cuda::OptionalCUDAGuard device_guard(device_of(src_cache)); + at::cuda::OptionalCUDAGuard device_guard; + + if (src_device.is_cuda()) { + device_guard.set_device(src_device); + } else if (dst_device.is_cuda()) { + device_guard.set_device(dst_device); + } int64_t num_blocks = src_cache.size(0); int64_t block_stride = src_cache.stride(0); diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f70a35b1c9470..e16c1c2a2ee1f 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -120,6 +120,8 @@ def test_reshape_and_cache( device: int, kv_cache_dtype: str, ) -> None: + if kv_cache_dtype != "auto": + return # No alternative fp8 operation to compare to random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -171,7 +173,8 @@ def test_reshape_and_cache( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( kv_cache_factory, @@ -183,8 +186,11 @@ def test_swap_blocks( num_blocks: int, dtype: torch.dtype, seed: int, - device: str, + 
device: int, + kv_cache_dtype: str, ) -> None: + if kv_cache_dtype == "fp8_e5m2" and "cpu" in direction: + return random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -205,12 +211,12 @@ def test_swap_blocks( # Create the KV caches on the first device. src_key_caches, src_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed, + num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed, src_device) # Create the KV caches on the second device. dist_key_caches, dist_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, dtype, None, seed, + num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed, dst_device) src_key_caches_clone = src_key_caches[0].clone() From 692f5ad4f4f21e9ec475fa3c50da1a9a6b06c97a Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 8 Feb 2024 08:48:16 -0800 Subject: [PATCH 070/159] Integrate e4m3 alongside e5m2 and adapt cache tests --- .gitignore | 1 + csrc/attention/attention_dtypes.h | 2 +- .../{dtype_fp8_e5m2.cuh => dtype_fp8.cuh} | 4 +- csrc/cache.h | 2 +- csrc/cache_kernels.cu | 36 +++++----- csrc/pybind.cpp | 8 ++- setup.py | 12 ++++ tests/kernels/test_attention.py | 8 +-- tests/kernels/test_cache.py | 65 +++++++++++++++++-- 9 files changed, 105 insertions(+), 33 deletions(-) rename csrc/attention/{dtype_fp8_e5m2.cuh => dtype_fp8.cuh} (85%) diff --git a/.gitignore b/.gitignore index b5195629e5cf3..b1513ef0ddb0c 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,7 @@ _build/ # hip files generated by PyTorch *.hip *_hip* +hip_compat.h # Benchmark dataset *.json diff --git a/csrc/attention/attention_dtypes.h b/csrc/attention/attention_dtypes.h index 61748e6b1eee6..64f86381d9db9 100644 --- a/csrc/attention/attention_dtypes.h +++ b/csrc/attention/attention_dtypes.h @@ -4,4 +4,4 @@ #include "dtype_float16.cuh" #include "dtype_float32.cuh" #include "dtype_bfloat16.cuh" -#include 
"dtype_fp8_e5m2.cuh" +#include "dtype_fp8.cuh" diff --git a/csrc/attention/dtype_fp8_e5m2.cuh b/csrc/attention/dtype_fp8.cuh similarity index 85% rename from csrc/attention/dtype_fp8_e5m2.cuh rename to csrc/attention/dtype_fp8.cuh index 7d9e15e24237d..d11dee91ebe87 100644 --- a/csrc/attention/dtype_fp8_e5m2.cuh +++ b/csrc/attention/dtype_fp8.cuh @@ -8,7 +8,7 @@ #endif namespace vllm { -//#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) // fp8 vector types for quantization of kv cache template<> @@ -30,6 +30,6 @@ template<> struct Vec { using Type = uint2; }; -//#endif // ENABLE_FP8_E5M2 +#endif // ENABLE_FP8_E5M2 } // namespace vllm diff --git a/csrc/cache.h b/csrc/cache.h index 765e231abd26f..7b9baa2ea97f5 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -24,6 +24,6 @@ void reshape_and_cache( const std::string& kv_cache_dtype); // Just for unittest -void convert_fp8_e5m2( +void convert_fp8( torch::Tensor& src_cache, torch::Tensor& dst_cache); diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 66c6245b0a920..2f17b54fb4998 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -4,9 +4,9 @@ #include "cuda_compat.h" #include "dispatch_utils.h" -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" -#else +#else if defined(ENABLE_FP8_E4M3) #include "quantization/fp8/amd_detail/quant_utils.cuh" #endif @@ -198,12 +198,14 @@ __global__ void reshape_and_cache_kernel( scalar_t tgt_key = key[src_key_idx]; scalar_t tgt_value = value[src_value_idx]; if constexpr (is_fp8_e5m2_kv_cache) { -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); -#else +#elif defined(ENABLE_FP8_E4M3) key_cache[tgt_key_idx] = fp8_e4m3::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e4m3::vec_conversion(tgt_value); +#else + assert(false); 
#endif } else { key_cache[tgt_key_idx] = tgt_key; @@ -273,30 +275,32 @@ void reshape_and_cache( namespace vllm { template -__global__ void convert_fp8_e5m2_kernel( +__global__ void convert_fp8_kernel( const Tin* __restrict__ src_cache, Tout* __restrict__ dst_cache, const int64_t block_stride) { const int64_t block_idx = blockIdx.x; for (int i = threadIdx.x; i < block_stride; i += blockDim.x) { int64_t idx = block_idx * block_stride + i; - #ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) dst_cache[idx] = fp8_e5m2_unscaled::vec_conversion(src_cache[idx]); -#else +#elif defined(ENABLE_FP8_E4M3) dst_cache[idx] = fp8_e4m3::vec_conversion(src_cache[idx]); +#else + assert(false); #endif } } } // namespace vllm -#define CALL_CONVERT_FP8_E5M2(Tout, Tin) \ - vllm::convert_fp8_e5m2_kernel<<>>( \ +#define CALL_CONVERT_FP8(Tout, Tin) \ + vllm::convert_fp8_kernel<<>>( \ reinterpret_cast(src_cache.data_ptr()), \ reinterpret_cast(dst_cache.data_ptr()), \ block_stride); -void convert_fp8_e5m2( +void convert_fp8( torch::Tensor& src_cache, torch::Tensor& dst_cache) { @@ -322,16 +326,16 @@ void convert_fp8_e5m2( const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); if (src_cache.dtype() == at::ScalarType::Float) { - CALL_CONVERT_FP8_E5M2(uint8_t, float); + CALL_CONVERT_FP8(uint8_t, float); } else if (src_cache.dtype() == at::ScalarType::Half) { - CALL_CONVERT_FP8_E5M2(uint8_t, uint16_t); + CALL_CONVERT_FP8(uint8_t, uint16_t); } else if (src_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8_E5M2(uint8_t, __nv_bfloat16); + CALL_CONVERT_FP8(uint8_t, __nv_bfloat16); } else if (dst_cache.dtype() == at::ScalarType::Float) { - CALL_CONVERT_FP8_E5M2(float, uint8_t); + CALL_CONVERT_FP8(float, uint8_t); } else if (dst_cache.dtype() == at::ScalarType::Half) { - CALL_CONVERT_FP8_E5M2(uint16_t, uint8_t); + CALL_CONVERT_FP8(uint16_t, uint8_t); } else if (dst_cache.dtype() == at::ScalarType::BFloat16) { - CALL_CONVERT_FP8_E5M2(__nv_bfloat16, uint8_t); + 
CALL_CONVERT_FP8(__nv_bfloat16, uint8_t); } } diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 4b6ade7566398..67672cbfb560d 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -82,8 +82,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &reshape_and_cache, "Reshape the key and value tensors and cache them"); cache_ops.def( - "convert_fp8_e5m2", - &convert_fp8_e5m2, + "gather_cached_kv", + &gather_cached_kv, + "Gather key and value from the cache into contiguous QKV tensors"); + cache_ops.def( + "convert_fp8", + &convert_fp8, "Convert the key and value cache to fp8_e5m2 data type"); // Cuda utils diff --git a/setup.py b/setup.py index 745b5a9b2d02a..d7706acb014b1 100644 --- a/setup.py +++ b/setup.py @@ -324,6 +324,18 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) +elif _is_hip(): + amd_archs = os.getenv("GPU_ARCHS") + if amd_archs is None: + amd_archs = get_amdgpu_offload_arch() + for arch in amd_archs.split(";"): + if arch not in ROCM_SUPPORTED_ARCHS: + raise RuntimeError( + f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" + f"amdgpu_arch_found: {arch}") + NVCC_FLAGS += [f"--offload-arch={arch}"] + NVCC_FLAGS += ["-DENABLE_FP8_E4M3"] + elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index fb571de63d4e1..68ca9d5898983 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -232,15 +232,15 @@ def test_paged_attention( block_size, x) dequantized_key_cache = torch.empty(size=key_cache_shape, dtype=dtype, - device=device) - cache_ops.convert_fp8_e5m2(key_cache, dequantized_key_cache) + device=gpu_id) + cache_ops.convert_fp8(key_cache, dequantized_key_cache) key_cache = dequantized_key_cache value_cache_shape = value_cache.shape dequantized_value_cache = torch.empty(size=value_cache_shape, dtype=dtype, - device=device) - cache_ops.convert_fp8_e5m2(value_cache, dequantized_value_cache) + device=gpu_id) + 
cache_ops.convert_fp8(value_cache, dequantized_value_cache) value_cache = dequantized_value_cache ref_output = torch.empty_like(query) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index e16c1c2a2ee1f..6874b09207c54 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -120,8 +120,6 @@ def test_reshape_and_cache( device: int, kv_cache_dtype: str, ) -> None: - if kv_cache_dtype != "auto": - return # No alternative fp8 operation to compare to random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -142,12 +140,24 @@ def test_reshape_and_cache( key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() + if kv_cache_dtype == "fp8_e5m2": + cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + cache_ops.convert_fp8(key_cache, cloned_key_cache) + cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + cache_ops.convert_fp8(value_cache, cloned_value_cache) + else: + cloned_key_cache = key_cache.clone() + cloned_value_cache = value_cache.clone() # Call the reshape_and_cache kernel. cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) + + if kv_cache_dtype == "fp8_e5m2": + result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) + cache_ops.convert_fp8(key_cache, result_key_cache) + result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) + cache_ops.convert_fp8(value_cache, result_value_cache) # Run the reference implementation. 
reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) @@ -160,9 +170,13 @@ def test_reshape_and_cache( block_offset = block_offsets[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] - - assert torch.allclose(key_cache, cloned_key_cache) - assert torch.allclose(value_cache, cloned_value_cache) + + if kv_cache_dtype == "fp8_e5m2": + assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.01, rtol=0.1) + assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.01, rtol=0.1) + else: + assert torch.allclose(key_cache, cloned_key_cache) + assert torch.allclose(value_cache, cloned_value_cache) @pytest.mark.parametrize("direction", COPYING_DIRECTION) @@ -232,3 +246,40 @@ def test_swap_blocks( dist_key_caches[0][dst].cpu()) assert torch.allclose(src_value_caches_clone[src].cpu(), dist_value_caches[0][dst].cpu()) + + +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", DEVICES) +@torch.inference_mode() +def test_fp8_conversion( + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: int, +) -> None: + random.seed(seed) + torch.random.manual_seed(seed) + torch.cuda.manual_seed(seed) + gpu_id = f"cuda:{device}" + + low = -240.0 + high = 240.0 + shape = (num_blocks, num_heads, head_size, block_size) + cache = torch.empty(shape, dtype=dtype, device=gpu_id) + cache.uniform_(low, high) + + cache_fp8 = torch.empty_like(cache, dtype=torch.uint8) + cache_ops.convert_fp8(cache, cache_fp8) + + converted_cache = torch.empty_like(cache) + cache_ops.convert_fp8(cache_fp8, converted_cache) + + assert torch.allclose(cache, converted_cache, 
atol=0.01, rtol=0.1) From c9321a01eefdd2b7782cb3f8ebba10c18f69dd70 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 8 Feb 2024 08:50:49 -0800 Subject: [PATCH 071/159] Add gfx942 to the arch list --- csrc/cache_kernels.cu | 7 +++---- setup.py | 2 +- vllm/utils.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 2f17b54fb4998..0ea66b5211e9d 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -296,8 +296,8 @@ __global__ void convert_fp8_kernel( #define CALL_CONVERT_FP8(Tout, Tin) \ vllm::convert_fp8_kernel<<>>( \ - reinterpret_cast(src_cache.data_ptr()), \ - reinterpret_cast(dst_cache.data_ptr()), \ + reinterpret_cast(src_cache.data_ptr()), \ + reinterpret_cast(dst_cache.data_ptr()), \ block_stride); void convert_fp8( @@ -312,7 +312,6 @@ void convert_fp8( "src and dst must be on the same GPU"); } at::cuda::OptionalCUDAGuard device_guard; - if (src_device.is_cuda()) { device_guard.set_device(src_device); } else if (dst_device.is_cuda()) { @@ -324,7 +323,7 @@ void convert_fp8( dim3 grid(num_blocks); dim3 block(std::min(block_stride, int64_t(512))); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - + if (src_cache.dtype() == at::ScalarType::Float) { CALL_CONVERT_FP8(uint8_t, float); } else if (src_cache.dtype() == at::ScalarType::Half) { diff --git a/setup.py b/setup.py index d7706acb014b1..d5eb125ec7ae4 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ # Supported NVIDIA GPU architectures. 
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx942", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) diff --git a/vllm/utils.py b/vllm/utils.py index a4f9bfe6aac99..945b8f32fd3c5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -239,7 +239,7 @@ def _generate_random_fp8_e5m2( from vllm._C import cache_ops tensor_tmp = torch.empty_like(tensor, dtype=torch.float16) tensor_tmp.uniform_(low, high) - cache_ops.convert_fp8_e5m2(tensor_tmp, tensor) + cache_ops.convert_fp8(tensor_tmp, tensor) del tensor_tmp From 4e1f89a727addf61408e93c39c49ef638120b069 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 8 Feb 2024 10:44:08 -0800 Subject: [PATCH 072/159] Less forgiving atol in fp8 tests --- tests/kernels/test_cache.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 6874b09207c54..caca39a4519a1 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -172,8 +172,8 @@ def test_reshape_and_cache( cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8_e5m2": - assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.01, rtol=0.1) - assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.01, rtol=0.1) + assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1) + assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1) else: assert torch.allclose(key_cache, cloned_key_cache) assert torch.allclose(value_cache, cloned_value_cache) @@ -282,4 +282,4 @@ def test_fp8_conversion( converted_cache = torch.empty_like(cache) cache_ops.convert_fp8(cache_fp8, converted_cache) - assert torch.allclose(cache, converted_cache, atol=0.01, rtol=0.1) + assert torch.allclose(cache, 
converted_cache, atol=0.001, rtol=0.1) From 7bc25742c37a11498a71f44c0a88bc920e045e53 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 9 Feb 2024 02:36:53 +0000 Subject: [PATCH 073/159] enable fp8-e4m3 kv cache on rocm --- csrc/attention/attention_kernels.cu | 18 ++++++++++++++---- csrc/cache_kernels.cu | 2 +- vllm/config.py | 10 ++++++++++ vllm/engine/arg_utils.py | 4 ++-- vllm/utils.py | 3 ++- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e2..27b542fedcb7e 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -29,6 +29,10 @@ #include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" #endif +#ifdef ENABLE_FP8_E4M3 +#include "../quantization/fp8/amd_detail/quant_utils.cuh" +#endif + #include #ifndef USE_ROCM @@ -150,7 +154,7 @@ __device__ void paged_attention_kernel( constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1); using K_vec = typename Vec::Type; using Q_vec = typename Vec::Type; -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) using Quant_vec = typename Vec::Type; #endif @@ -221,6 +225,9 @@ __device__ void paged_attention_kernel( Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); // Vector conversion from Quant_vec to K_vec. 
k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); +#elif defined(ENABLE_FP8_E4M3) + Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + k_vecs[j] = fp8_e4m3::vec_conversion(k_vec_quant); #else assert(false); #endif @@ -300,7 +307,7 @@ __device__ void paged_attention_kernel( constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE); using V_vec = typename Vec::Type; using L_vec = typename Vec::Type; -#ifdef ENABLE_FP8_E5M2 +#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3) using V_quant_vec = typename Vec::Type; #endif using Float_L_vec = typename FloatVec::Type; @@ -341,6 +348,9 @@ __device__ void paged_attention_kernel( V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); +#elif defined(ENABLE_FP8_E4M3) + V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); + v_vec = fp8_e4m3::vec_conversion(v_quant_vec); #else assert(false); #endif @@ -739,7 +749,7 @@ void paged_attention_v1( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { if (query.dtype() == at::ScalarType::Float) { CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { @@ -932,7 +942,7 @@ void paged_attention_v2( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { if (query.dtype() == at::ScalarType::Float) { CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0ea66b5211e9d..0a7ae3ae1e7d6 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -259,7 +259,7 @@ void reshape_and_cache( } else if (key.dtype() == 
at::ScalarType::BFloat16) { CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); } - } else if (kv_cache_dtype == "fp8_e5m2") { + } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { if (key.dtype() == at::ScalarType::Float) { CALL_RESHAPE_AND_CACHE(float, uint8_t, true); } else if (key.dtype() == at::ScalarType::Half) { diff --git a/vllm/config.py b/vllm/config.py index b4d48d34a8a72..8f4e980577d46 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -345,6 +345,16 @@ def _verify_cache_dtype(self) -> None: "But it may cause slight accuracy drop. " "Currently we only support fp8 without scaling factors and " "make e5m2 as a default format.") + elif self.cache_dtype == "fp8_e4m3": + device_name = torch.cuda.get_device_name() + if not "AMD" in device_name: + raise NotImplementedError( + "FP8_E4M3 KV Cache on NVIDIA GPU has not been supported yet.") + logger.info( + "Using fp8_e4m3 data type to store kv cache. It reduces " + "the GPU memory footprint and boosts the performance. " + "But it may cause slight accuracy drop. " + "Currently we only support fp8 without scaling factors") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c01e7311fb89a..449d6fbae1375 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -135,8 +135,8 @@ def add_cli_args( parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8_e5m2'], - default=EngineArgs.kv_cache_dtype, + choices=['auto', 'fp8_e5m2', 'fp8_e4m3'], + default='auto', help='Data type for kv cache storage. If "auto", will use model ' 'data type. 
Note FP8 is not supported when cuda version is ' 'lower than 11.8.') diff --git a/vllm/utils.py b/vllm/utils.py index 945b8f32fd3c5..61228d96e7fc2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -29,6 +29,7 @@ "bfloat16": torch.bfloat16, "float": torch.float, "fp8_e5m2": torch.uint8, + "fp8_e4m3": torch.uint8, } @@ -268,7 +269,7 @@ def create_kv_caches_with_random( raise ValueError(f"Invalid model dtype: {model_dtype}") elif cache_dtype in ["half", "bfloat16", "float"]: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - elif cache_dtype == "fp8_e5m2": + elif cache_dtype == "fp8_e5m2" or "fp8_e4m3": torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") From ebf7542f117f9aaa6285983ccc7a3aaf9e1797c1 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 9 Feb 2024 17:09:48 +0000 Subject: [PATCH 074/159] Address naming conventions --- csrc/attention/attention_kernels.cu | 64 +++++++++++++++-------------- csrc/cache_kernels.cu | 6 +-- vllm/engine/arg_utils.py | 7 ++-- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 27b542fedcb7e..0719ec7d796fb 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -25,16 +25,20 @@ #include "attention_dtypes.h" #include "attention_utils.cuh" -#ifdef ENABLE_FP8_E5M2 -#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" -#endif -#ifdef ENABLE_FP8_E4M3 +#if defined(ENABLE_FP8_E5M2) +#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh" +#elif defined(ENABLE_FP8_E4M3) #include "../quantization/fp8/amd_detail/quant_utils.cuh" #endif #include +#ifdef USE_ROCM + #include + typedef __hip_bfloat16 __nv_bfloat16; +#endif + #ifndef USE_ROCM #define WARP_SIZE 32 #else @@ -90,7 +94,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int PARTITION_SIZE = 0> // Zero means no partitioning. 
__device__ void paged_attention_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -220,8 +224,8 @@ __device__ void paged_attention_kernel( const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE; const int offset1 = (vec_idx * VEC_SIZE) / x; const int offset2 = (vec_idx * VEC_SIZE) % x; - if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 + if constexpr (IS_FP8_KV_CACHE) { +#if defined(ENABLE_FP8_E5M2) Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); // Vector conversion from Quant_vec to K_vec. k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); @@ -343,8 +347,8 @@ __device__ void paged_attention_kernel( if (row_idx < HEAD_SIZE) { const int offset = row_idx * BLOCK_SIZE + physical_block_offset; V_vec v_vec; - if constexpr (IS_FP8_E5M2_KV_CACHE) { -#ifdef ENABLE_FP8_E5M2 + if constexpr (IS_FP8_KV_CACHE) { +#if defined(ENABLE_FP8_E5M2) V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. 
v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); @@ -441,7 +445,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE> + bool IS_FP8_KV_CACHE> __global__ void paged_attention_v1_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] @@ -456,7 +460,7 @@ __global__ void paged_attention_v1_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -469,7 +473,7 @@ template< int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int PARTITION_SIZE> __global__ void paged_attention_v2_kernel( float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions] @@ -487,7 +491,7 @@ __global__ void paged_attention_v2_kernel( const int q_stride, const int kv_block_stride, const int kv_head_stride) { - paged_attention_kernel( + paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); @@ -597,9 +601,9 @@ __global__ void paged_attention_v2_reduce_kernel( #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \ + IS_FP8_KV_CACHE>), shared_mem_size); \ vllm::paged_attention_v1_kernel<<>>( \ + IS_FP8_KV_CACHE><<>>( \ out_ptr, \ query_ptr, \ key_cache_ptr, \ @@ -619,7 +623,7 @@ template< typename T, typename CACHE_T, int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int NUM_THREADS = 128> void paged_attention_v1_launcher( torch::Tensor& out, @@ -695,8 
+699,8 @@ void paged_attention_v1_launcher( } } -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v1_launcher( \ +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v1_launcher( \ out, \ query, \ key_cache, \ @@ -710,16 +714,16 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. -#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -766,7 +770,7 @@ void paged_attention_v1( #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ vllm::paged_attention_v2_kernel \ + IS_FP8_KV_CACHE, PARTITION_SIZE> \ <<>>( \ exp_sums_ptr, \ max_logits_ptr, \ @@ -796,7 +800,7 @@ template< typename T, typename CACHE_T, int BLOCK_SIZE, - bool IS_FP8_E5M2_KV_CACHE, + bool IS_FP8_KV_CACHE, int NUM_THREADS = 128, int PARTITION_SIZE = 512> void paged_attention_v2_launcher( @@ -882,8 +886,8 @@ void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \ - paged_attention_v2_launcher( \ +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v2_launcher( \ out, \ exp_sums, \ max_logits, \ @@ -900,16 +904,16 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. 
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 0a7ae3ae1e7d6..36a6d1cc4027d 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -6,7 +6,7 @@ #include "dispatch_utils.h" #if defined(ENABLE_FP8_E5M2) #include "quantization/fp8_e5m2_kvcache/quant_utils.cuh" -#else if defined(ENABLE_FP8_E4M3) +#elif defined(ENABLE_FP8_E4M3) #include "quantization/fp8/amd_detail/quant_utils.cuh" #endif @@ -216,8 +216,8 @@ __global__ void reshape_and_cache_kernel( } // namespace vllm -#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \ - vllm::reshape_and_cache_kernel<<>>( \ +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ + vllm::reshape_and_cache_kernel<<>>( \ reinterpret_cast(key.data_ptr()), \ reinterpret_cast(value.data_ptr()), \ reinterpret_cast(key_cache.data_ptr()), \ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 449d6fbae1375..c737e50b0b4c6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -137,9 +137,10 @@ def add_cli_args( type=str, choices=['auto', 'fp8_e5m2', 'fp8_e4m3'], default='auto', - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. 
FP8_E5M2 is only supported on cuda version greater than 11.8. ' + 'On AMD GPUs, only a more standard FP8_E4M3 is supported for inference. ') + parser.add_argument('--max-model-len', type=int, default=EngineArgs.max_model_len, From ad440555254afd88422b0ae3756de8daeb014e96 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 9 Feb 2024 15:34:40 +0000 Subject: [PATCH 075/159] Fix or comparisons --- csrc/cache_kernels.cu | 2 +- vllm/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 36a6d1cc4027d..b1f6053408028 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -259,7 +259,7 @@ void reshape_and_cache( } else if (key.dtype() == at::ScalarType::BFloat16) { CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); } - } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { + } else if (kv_cache_dtype == "fp8_e5m2" || kv_cache_dtype == "fp8_e4m3") { if (key.dtype() == at::ScalarType::Float) { CALL_RESHAPE_AND_CACHE(float, uint8_t, true); } else if (key.dtype() == at::ScalarType::Half) { diff --git a/vllm/utils.py b/vllm/utils.py index 61228d96e7fc2..40b105aec08c4 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -269,7 +269,7 @@ def create_kv_caches_with_random( raise ValueError(f"Invalid model dtype: {model_dtype}") elif cache_dtype in ["half", "bfloat16", "float"]: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] - elif cache_dtype == "fp8_e5m2" or "fp8_e4m3": + elif cache_dtype in ["fp8_e5m2", "fp8_e4m3"]: torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") From e5e0e7cc73f03b5ce08e019e581e11f6a8b1ec14 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 15:30:59 +0000 Subject: [PATCH 076/159] Rename remaining fp8_e5m2 to general fp8 Reduce fp8 range in the conversion test to match e4m3 Add other MI300 architectures to the list Simplify device guard use in conversion kernel --- benchmarks/benchmark_latency.py | 
2 +- benchmarks/benchmark_throughput.py | 2 +- .../kernels/benchmark_paged_attention.py | 2 +- csrc/attention/attention_kernels.cu | 4 +-- csrc/cache_kernels.cu | 20 ++++++-------- csrc/pybind.cpp | 2 +- ...fp8_e5m2_kv_cache.rst => fp8_kv_cache.rst} | 4 +-- setup.py | 2 +- tests/kernels/test_attention.py | 7 ++--- tests/kernels/test_cache.py | 18 ++++++------- vllm/config.py | 19 ++++++-------- vllm/engine/arg_utils.py | 2 +- vllm/utils.py | 26 ++++++++++++------- 13 files changed, 55 insertions(+), 55 deletions(-) rename docs/source/quantization/{fp8_e5m2_kv_cache.rst => fp8_kv_cache.rst} (92%) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 6e3b679cb81b2..1dcba2ff388e8 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -125,7 +125,7 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( "--kv-cache-dtype", type=str, - choices=['auto', 'fp8_e5m2'], + choices=['auto', 'fp8'], default='auto', help= 'Data type for kv cache storage. If "auto", will use model data type.') diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 1ad502526c97c..ddaee7aaee19f 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -292,7 +292,7 @@ def main(args: argparse.Namespace): parser.add_argument( "--kv-cache-dtype", type=str, - choices=["auto", "fp8_e5m2"], + choices=["auto", "fp8"], default="auto", help= 'Data type for kv cache storage. 
If "auto", will use model data type.') diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index d921dea1220e1..de81a0adffe76 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -179,7 +179,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: parser.add_argument( "--kv-cache-dtype", type=str, - choices=["auto", "fp8_e5m2"], + choices=["auto", "fp8"], default="auto", help= 'Data type for kv cache storage. If "auto", will use model data type.') diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 0719ec7d796fb..463d817af142c 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -753,7 +753,7 @@ void paged_attention_v1( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { + } else if (kv_cache_dtype == "fp8") { if (query.dtype() == at::ScalarType::Float) { CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { @@ -946,7 +946,7 @@ void paged_attention_v2( } else { TORCH_CHECK(false, "Unsupported data type: ", query.dtype()); } - } else if (kv_cache_dtype == "fp8_e5m2" or "fp8_e4m3") { + } else if (kv_cache_dtype == "fp8") { if (query.dtype() == at::ScalarType::Float) { CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true); } else if (query.dtype() == at::ScalarType::Half) { diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index b1f6053408028..57c4c325531a6 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -259,7 +259,7 @@ void reshape_and_cache( } else if (key.dtype() == at::ScalarType::BFloat16) { CALL_RESHAPE_AND_CACHE(__nv_bfloat16, __nv_bfloat16, false); } - } else if (kv_cache_dtype == "fp8_e5m2" || kv_cache_dtype == "fp8_e4m3") { + } else if (kv_cache_dtype == "fp8") { if (key.dtype() == 
at::ScalarType::Float) { CALL_RESHAPE_AND_CACHE(float, uint8_t, true); } else if (key.dtype() == at::ScalarType::Half) { @@ -306,17 +306,13 @@ void convert_fp8( { torch::Device src_device = src_cache.device(); torch::Device dst_device = dst_cache.device(); - if (src_device.is_cuda() && dst_device.is_cuda()) { - TORCH_CHECK( - src_device.index() == dst_device.index(), - "src and dst must be on the same GPU"); - } - at::cuda::OptionalCUDAGuard device_guard; - if (src_device.is_cuda()) { - device_guard.set_device(src_device); - } else if (dst_device.is_cuda()) { - device_guard.set_device(dst_device); - } + TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU") + TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU") + TORCH_CHECK( + src_device.index() == dst_device.index(), + "src and dst must be on the same GPU"); + at::cuda::OptionalCUDAGuard device_guard(src_device); + int64_t num_blocks = src_cache.size(0); int64_t block_stride = src_cache.stride(0); diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 67672cbfb560d..ea8f100d12f55 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -88,7 +88,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { cache_ops.def( "convert_fp8", &convert_fp8, - "Convert the key and value cache to fp8_e5m2 data type"); + "Convert the key and value cache to fp8 data type"); // Cuda utils pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); diff --git a/docs/source/quantization/fp8_e5m2_kv_cache.rst b/docs/source/quantization/fp8_kv_cache.rst similarity index 92% rename from docs/source/quantization/fp8_e5m2_kv_cache.rst rename to docs/source/quantization/fp8_kv_cache.rst index f1eeb59550952..99d619a119e35 100644 --- a/docs/source/quantization/fp8_e5m2_kv_cache.rst +++ b/docs/source/quantization/fp8_kv_cache.rst @@ -1,4 +1,4 @@ -.. _fp8_e5m2_kv_cache: +.. _fp8_kv_cache: FP8 E5M2 KV Cache ================== @@ -21,7 +21,7 @@ Here is an example of how to enable this feature: # Create a sampling params object. 
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8_e5m2") + llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/setup.py b/setup.py index d5eb125ec7ae4..78e93cae920ef 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx942", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 68ca9d5898983..3b9079a74e621 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -33,7 +33,7 @@ BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] -KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] +KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) @@ -225,7 +225,7 @@ def test_paged_attention( raise AssertionError(f"Unknown version: {version}") # Run the reference implementation. - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": # Convert cache data back to dtype. x = 16 // torch.tensor([], dtype=dtype).element_size() key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, @@ -264,7 +264,8 @@ def test_paged_attention( # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. 
- if kv_cache_dtype == "fp8_e5m2": + atol, rtol = 1e-3, 1e-5 + if kv_cache_dtype == "fp8": atol, rtol = 1e-2, 1e-5 assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index caca39a4519a1..f69b44cc8706e 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -21,10 +21,8 @@ ] # Arbitrary values for testing NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] -KV_CACHE_DTYPE = ["auto", "fp8_e5m2"] +DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)] +KV_CACHE_DTYPE = ["auto", "fp8"] @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) @@ -140,7 +138,7 @@ def test_reshape_and_cache( key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) cache_ops.convert_fp8(key_cache, cloned_key_cache) cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) @@ -153,7 +151,7 @@ def test_reshape_and_cache( cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) cache_ops.convert_fp8(key_cache, result_key_cache) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) @@ -171,7 +169,7 @@ def test_reshape_and_cache( cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] - if kv_cache_dtype == "fp8_e5m2": + if kv_cache_dtype == "fp8": assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1) assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1) else: @@ -203,7 +201,7 @@ def 
test_swap_blocks( device: int, kv_cache_dtype: str, ) -> None: - if kv_cache_dtype == "fp8_e5m2" and "cpu" in direction: + if kv_cache_dtype == "fp8" and "cpu" in direction: return random.seed(seed) torch.random.manual_seed(seed) @@ -270,8 +268,8 @@ def test_fp8_conversion( torch.cuda.manual_seed(seed) gpu_id = f"cuda:{device}" - low = -240.0 - high = 240.0 + low = -224.0 + high = 224.0 shape = (num_blocks, num_heads, head_size, block_size) cache = torch.empty(shape, dtype=dtype, device=gpu_id) cache.uniform_(low, high) diff --git a/vllm/config.py b/vllm/config.py index 8f4e980577d46..569a5f3ca5516 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -329,18 +329,15 @@ def _verify_args(self) -> None: def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass - elif self.cache_dtype == "fp8_e5m2": - nvcc_cuda_version = get_nvcc_cuda_version() - if nvcc_cuda_version and nvcc_cuda_version < Version("11.8"): - raise ValueError( - "FP8 is not supported when cuda version is lower than 11.8." - ) - device_name = torch.cuda.get_device_name() - if "AMD" in device_name: - raise NotImplementedError( - "FP8_E5M2 KV Cache on AMD GPU has not been supported yet.") + elif self.cache_dtype == "fp8": + if not is_hip(): + nvcc_cuda_version = get_nvcc_cuda_version() + if nvcc_cuda_version < Version("11.8"): + raise ValueError( + "FP8 is not supported when cuda version is lower than 11.8." + ) logger.info( - "Using fp8_e5m2 data type to store kv cache. It reduces " + "Using fp8 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. 
" "Currently we only support fp8 without scaling factors and " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c737e50b0b4c6..4ff78e610ec81 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -135,7 +135,7 @@ def add_cli_args( parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8'], default='auto', help='Data type for kv cache storage. If "auto", will use model data ' 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' diff --git a/vllm/utils.py b/vllm/utils.py index 40b105aec08c4..46151b7e7f742 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -28,8 +28,12 @@ "half": torch.half, "bfloat16": torch.bfloat16, "float": torch.float, +<<<<<<< HEAD "fp8_e5m2": torch.uint8, "fp8_e4m3": torch.uint8, +======= + "fp8": torch.uint8, +>>>>>>> Rename remaining fp8_e5m2 to general fp8 } @@ -224,7 +228,7 @@ def get_nvcc_cuda_version() -> Optional[Version]: return nvcc_cuda_version -def _generate_random_fp8_e5m2( +def _generate_random_fp8( tensor: torch.tensor, low: float, high: float, @@ -269,7 +273,11 @@ def create_kv_caches_with_random( raise ValueError(f"Invalid model dtype: {model_dtype}") elif cache_dtype in ["half", "bfloat16", "float"]: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] +<<<<<<< HEAD elif cache_dtype in ["fp8_e5m2", "fp8_e4m3"]: +======= + elif cache_dtype == "fp8": +>>>>>>> Rename remaining fp8_e5m2 to general fp8 torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") @@ -286,13 +294,13 @@ def create_kv_caches_with_random( key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype == 'fp8_e5m2': - _generate_random_fp8_e5m2(key_cache, -scale, scale) - elif torch_dtype in [torch.half, torch.bfloat16, torch.float]: + if cache_dtype in ["auto", "half", "bfloat16", "float"]: key_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8': + 
_generate_random_fp8(key_cache, -scale, scale) else: - raise ValueError( - f"Does not support key cache of type {cache_dtype}") + raise ValueError( + f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, block_size) @@ -301,10 +309,10 @@ def create_kv_caches_with_random( value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype == 'fp8_e5m2': - _generate_random_fp8_e5m2(value_cache, -scale, scale) - elif torch_dtype in [torch.half, torch.bfloat16, torch.float]: + if cache_dtype in ["auto", "half", "bfloat16", "float"]: value_cache.uniform_(-scale, scale) + elif cache_dtype == 'fp8': + _generate_random_fp8(value_cache, -scale, scale) else: raise ValueError( f"Does not support value cache of type {cache_dtype}") From c86b2ec0980fb4a7efb561c0df2c57bbbf44931e Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 15:48:50 +0000 Subject: [PATCH 077/159] Add e4m3 to attention kernels --- csrc/attention/attention_kernels.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 463d817af142c..173b21ed89fc8 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -230,8 +230,9 @@ __device__ void paged_attention_kernel( // Vector conversion from Quant_vec to K_vec. k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); #elif defined(ENABLE_FP8_E4M3) - Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); - k_vecs[j] = fp8_e4m3::vec_conversion(k_vec_quant); + Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); + // Vector conversion from Quant_vec to K_vec. 
+ k_vecs[j] = fp8_e4m3::vec_conversion(k_vec_quant); #else assert(false); #endif @@ -353,8 +354,10 @@ __device__ void paged_attention_kernel( // Vector conversion from V_quant_vec to V_vec. v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); #elif defined(ENABLE_FP8_E4M3) - V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); - v_vec = fp8_e4m3::vec_conversion(v_quant_vec); + V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); + // Vector conversion from V_quant_vec to V_vec. + v_vec = fp8_e4m3::vec_conversion(v_quant_vec); +>>>>>>> Add e4m3 to attention kernels #else assert(false); #endif From a432815ee7812e66886e07241a1bac5fd3584ce7 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 16:15:56 +0000 Subject: [PATCH 078/159] Remove remaining mentions of e5m2 where it refers to general fp8 --- csrc/cache_kernels.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 57c4c325531a6..3e231b414b377 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -153,7 +153,7 @@ void copy_blocks( namespace vllm { -template +template __global__ void reshape_and_cache_kernel( const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size] const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size] @@ -197,7 +197,7 @@ __global__ void reshape_and_cache_kernel( + block_offset; scalar_t tgt_key = key[src_key_idx]; scalar_t tgt_value = value[src_value_idx]; - if constexpr (is_fp8_e5m2_kv_cache) { + if constexpr (is_fp8_kv_cache) { #if defined(ENABLE_FP8_E5M2) key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); From 4b7712676658df362e043109f765a43306c446aa Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 9 Feb 2024 18:42:43 +0000 Subject: [PATCH 079/159] Updated fp8 help text in additional files similar to arg_utils ---
benchmarks/benchmark_latency.py | 5 +++-- benchmarks/benchmark_throughput.py | 5 +++-- benchmarks/kernels/benchmark_paged_attention.py | 5 +++-- vllm/engine/arg_utils.py | 1 - 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 1dcba2ff388e8..d382db5e1892c 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -127,8 +127,9 @@ def run_to_completion(profile_dir: Optional[str] = None): type=str, choices=['auto', 'fp8'], default='auto', - help= - 'Data type for kv cache storage. If "auto", will use model data type.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' + 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ddaee7aaee19f..e5dfd463ea131 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -294,8 +294,9 @@ def main(args: argparse.Namespace): type=str, choices=["auto", "fp8"], default="auto", - help= - 'Data type for kv cache storage. If "auto", will use model data type.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. 
' + 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') parser.add_argument( "--device", type=str, diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index de81a0adffe76..5c0eff4e4a73d 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -181,8 +181,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: type=str, choices=["auto", "fp8"], default="auto", - help= - 'Data type for kv cache storage. If "auto", will use model data type.') + help='Data type for kv cache storage. If "auto", will use model data ' + 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' + 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') parser.add_argument("--device", type=str, choices=["cuda"], default="cuda") args = parser.parse_args() print(args) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4ff78e610ec81..41e3b3596c73d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -140,7 +140,6 @@ def add_cli_args( help='Data type for kv cache storage. If "auto", will use model data ' 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' 'On AMD GPUs, only a more standard FP8_E4M3 is supported for inference. ') - parser.add_argument('--max-model-len', type=int, default=EngineArgs.max_model_len, From bbf6d491125da4e04e75a33a943c38592fe184a8 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 9 Feb 2024 19:48:04 +0000 Subject: [PATCH 080/159] generalize fp8 convention --- vllm/config.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 569a5f3ca5516..e19fafb777191 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -340,18 +340,8 @@ def _verify_cache_dtype(self) -> None: "Using fp8 data type to store kv cache. 
It reduces " "the GPU memory footprint and boosts the performance. " "But it may cause slight accuracy drop. " - "Currently we only support fp8 without scaling factors and " - "make e5m2 as a default format.") - elif self.cache_dtype == "fp8_e4m3": - device_name = torch.cuda.get_device_name() - if not "AMD" in device_name: - raise NotImplementedError( - "FP8_E4M3 KV Cache on NVIDIA GPU has not been supported yet.") - logger.info( - "Using fp8_e4m3 data type to store kv cache. It reduces " - "the GPU memory footprint and boosts the performance. " - "But it may cause slight accuracy drop. " - "Currently we only support fp8 without scaling factors") + "FP8_E5M2 is only supported on cuda version greater than 11.8." + "On AMD GPUs, only a more standard FP8_E4M3 is supported for inference.") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") From 4dfb26df52b91840cd24ba629bdf0d16d5e32ca5 Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Fri, 9 Feb 2024 16:18:48 -0800 Subject: [PATCH 081/159] Update log info and args description w.r.t. FP8 KV cache. --- benchmarks/benchmark_latency.py | 6 +++--- benchmarks/benchmark_throughput.py | 6 +++--- benchmarks/kernels/benchmark_paged_attention.py | 7 +++---- vllm/config.py | 6 +++--- vllm/engine/arg_utils.py | 6 +++--- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index d382db5e1892c..b88772f49ccc7 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -127,9 +127,9 @@ def run_to_completion(profile_dir: Optional[str] = None): type=str, choices=['auto', 'fp8'], default='auto', - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') + help='Data type for kv cache storage. If "auto", will use model data type. 
' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index e5dfd463ea131..73ff77719f781 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -294,9 +294,9 @@ def main(args: argparse.Namespace): type=str, choices=["auto", "fp8"], default="auto", - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') + help='Data type for kv cache storage. If "auto", will use model data type. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( "--device", type=str, diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 5c0eff4e4a73d..575ca95e772c4 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -181,10 +181,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: type=str, choices=["auto", "fp8"], default="auto", - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only the more standard FP8_E4M3 is supported for inference.') - parser.add_argument("--device", type=str, choices=["cuda"], default="cuda") + help='Data type for kv cache storage. If "auto", will use model data type. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') args = parser.parse_args() print(args) diff --git a/vllm/config.py b/vllm/config.py index e19fafb777191..93f9dce1d7d36 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -339,9 +339,9 @@ def _verify_cache_dtype(self) -> None: logger.info( "Using fp8 data type to store kv cache. It reduces " "the GPU memory footprint and boosts the performance. " - "But it may cause slight accuracy drop. " - "FP8_E5M2 is only supported on cuda version greater than 11.8." - "On AMD GPUs, only a more standard FP8_E4M3 is supported for inference.") + "But it may cause slight accuracy drop without scaling factors. " + "FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8." + "On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 41e3b3596c73d..b40a573187379 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -137,9 +137,9 @@ def add_cli_args( type=str, choices=['auto', 'fp8'], default='auto', - help='Data type for kv cache storage. If "auto", will use model data ' - 'type. FP8_E5M2 is only supported on cuda version greater than 11.8. ' - 'On AMD GPUs, only a more standard FP8_E4M3 is supported for inference. ') + help='Data type for kv cache storage. If "auto", will use model data type. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument('--max-model-len', type=int, default=EngineArgs.max_model_len, From 0f492f55bc46df8a7bf80227fe39359c1fe9cee2 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Mon, 12 Feb 2024 22:31:09 +0000 Subject: [PATCH 082/159] Initial skeleton; scaling factors in CacheEngine and PagedAttention --- 3rdparty/quantizer/extract_scales.py | 82 ++++++ vllm/config.py | 7 + vllm/engine/arg_utils.py | 11 + vllm/engine/llm_engine.py | 1 + vllm/model_executor/layers/attention.py | 8 +- vllm/model_executor/models/baichuan.py | 7 +- vllm/model_executor/models/bloom.py | 7 +- vllm/model_executor/models/chatglm.py | 5 +- vllm/model_executor/models/deepseek.py | 7 +- vllm/model_executor/models/falcon.py | 7 +- vllm/model_executor/models/gemma.py | 7 +- vllm/model_executor/models/gpt2.py | 5 +- vllm/model_executor/models/gpt_bigcode.py | 5 +- vllm/model_executor/models/gpt_j.py | 7 +- vllm/model_executor/models/gpt_neox.py | 7 +- vllm/model_executor/models/internlm.py | 300 ++++++++++++++++++++ vllm/model_executor/models/internlm2.py | 7 +- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/models/mixtral.py | 7 +- vllm/model_executor/models/mixtral_quant.py | 7 +- vllm/model_executor/models/mpt.py | 7 +- vllm/model_executor/models/opt.py | 6 +- vllm/model_executor/models/orion.py | 7 +- vllm/model_executor/models/phi.py | 7 +- vllm/model_executor/models/qwen.py | 7 +- vllm/model_executor/models/qwen2.py | 7 +- vllm/model_executor/models/stablelm.py | 7 +- vllm/model_executor/models/starcoder2.py | 7 +- vllm/model_executor/weight_utils.py | 16 ++ vllm/worker/cache_engine.py | 58 +++- vllm/worker/model_runner.py | 8 +- 31 files changed, 560 insertions(+), 78 deletions(-) create mode 100644 3rdparty/quantizer/extract_scales.py create mode 100644 vllm/model_executor/models/internlm.py diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py new 
file mode 100644 index 0000000000000..ace05e52a69b8 --- /dev/null +++ b/3rdparty/quantizer/extract_scales.py @@ -0,0 +1,82 @@ +import argparse +import json +import os +from vllm.model_executor.weight_utils import ( + hf_model_weights_iterator, + prepare_hf_model_weights +) + +default_output_name = "kv_cache_scales.json" + + +def main(args): + layer_scale_factors_map = {} + if args.output is None: + hf_folder, _, _ = prepare_hf_model_weights(args.model, + args.cache_dir, + args.load_format, + revision=args.revision) + output_file = os.path.join(hf_folder, default_output_name) + else: + output_file = os.path.join(args.output, default_output_name) + if not os.path.isdir(args.output): + os.makedirs(args.output, exist_ok=True) + + for name, param in hf_model_weights_iterator(args.model, + args.cache_dir, + args.load_format, + args.revision): + if "kv_cache_scaling_factor" in name: + nums = [int(s) for s in name.split('.') if s.isdigit()] + assert len(nums) == 1, f"Could not determine layer idx for {name}!" + layer_idx = nums[0] + assert layer_idx not in layer_scale_factors_map, f"Duplicate scaling " \ + f"factor corresponding to layer {layer_idx}!" + try: + layer_scale_factors_map[layer_idx] = param.item() + except RuntimeError: + print("This utility supports only per-tensor scalar scale factors " + f"for now. The tensor\n {name} = {param} is an invalid " + "scale factor!") + raise + if len(layer_scale_factors_map) == 0: + print("WARNING: No KV cache scale factors found! No output saved.") + else: + with open(output_file, 'w') as f: + json.dump(layer_scale_factors_map, f, sort_keys=True) + print(f"Completed! 
KV cache scaling factors saved to {output_file}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="This simple utility extracts the " + "KV cache scaling factors from a quantized HF model " + "and saves them to a JSON file compatible with later " + "use by vLLM (pass this file to the appropriate " + "runtime typically using the argument " + "--kv-cache-scales ). This is only used " + "if the KV cache dtype is FP8.") + parser.add_argument("--model", + help="Specify either a directory or name of a HF model. If the model " + "does not exist, this utility will attempt to download said model " + "from the HF repo.", + required=True) + parser.add_argument("--cache_dir", + help="Optionally specify a cache directory to use for a HF model " + "download.", + default=None) + parser.add_argument("--load_format", + help="Optionally specify the format of the model's tensor files " + "containing the KV cache scaling factors.", + choices=["auto", "safetensors", "pt", "npcache"], + default="auto") + parser.add_argument("--revision", + help="Optionally specify the model's revision number.", + default=None) + parser.add_argument("--output", + help="Specify the output directory. 
By default it will be saved in " + f"the model directory with the filename {default_output_name}, " + "however you can override this behavior here.", + default=None) + args = parser.parse_args() + + main(args) diff --git a/vllm/config.py b/vllm/config.py index 93f9dce1d7d36..0fdb27a81c49e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -302,6 +302,7 @@ def __init__( gpu_memory_utilization: float, swap_space: int, cache_dtype: str, + kv_cache_scales: Optional[str] = None, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size @@ -309,6 +310,7 @@ def __init__( self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window + self.kv_cache_scales = kv_cache_scales self._verify_args() self._verify_cache_dtype() @@ -330,6 +332,11 @@ def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass elif self.cache_dtype == "fp8": + if self.kv_cache_scales is None: + logger.warn(f"Using cache dtype {self.cache_dtype} but no " + "scaling factors provided. Defaulting to 1.0 " + "scales, be warned that this might lead to " + "inaccurate results!") if not is_hip(): nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version < Version("11.8"): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b40a573187379..2ae7ae57e4a40 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,6 +18,7 @@ class EngineArgs: load_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' + kv_cache_scales: str = None seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False @@ -140,6 +141,15 @@ def add_cli_args( help='Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + parser.add_argument( + '--kv-cache-scales', + type=str, + default=None, + help='Path to the JSON file containing the KV cache scaling factors. ' + 'This should generally be supplied when KV cache dtype is FP8. Otherwise ' + 'the KV cache scaling factors default to 1.0, which will likely cause ' + 'accuracy issues. Note FP8 is not supported when cuda version is ' + 'lower than 11.8.') parser.add_argument('--max-model-len', type=int, default=EngineArgs.max_model_len, @@ -293,6 +303,7 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, + self.kv_cache_scales, model_config.get_sliding_window()) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index df4858a696530..eb246c7abe3cd 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -101,6 +101,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " + f"kv_cache_scales={cache_config.kv_cache_scales}, " f"device_config={device_config.device}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 2a82325b80213..61d4e9d172c6a 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -100,6 +100,7 @@ def forward( value: torch.Tensor, key_cache: Optional[torch.Tensor], value_cache: Optional[torch.Tensor], + kv_cache_scaling_factor: Optional[torch.Tensor], input_metadata: InputMetadata, ) -> torch.Tensor: """PagedAttention forward pass. 
@@ -121,12 +122,17 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - + # Reshape the keys and values and store them in the cache. # If key_cache and value_cache are not provided, the new key and value # vectors will not be cached. This happens during the initial memory # profiling run. if key_cache is not None and value_cache is not None: + if kv_cache_scaling_factor is not None: + # Scale the key and value scaling factors for quantization + # by cache ops + key = key.div_(kv_cache_scaling_factor) + value = value.div_(kv_cache_scaling_factor) cache_ops.reshape_and_cache( key, value, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 550dec6487f9e..cde371d0a7b7f 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -44,7 +44,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -177,8 +177,9 @@ def forward( q, k, v = qkv.chunk(chunks=3, dim=-1) if self.postion_embedding != "ALIBI": q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 4adfb6b78102f..f9954849bc081 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, 
torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -122,8 +122,9 @@ def forward( del position_ids # Unused. qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index dca8d724f976b..3fc7cafaa006f 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -28,7 +28,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GLMAttention(nn.Module): @@ -104,13 +104,14 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(position_ids, q, k) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache context_layer = self.attn( q, k, v, key_cache, value_cache, + kv_cache_scaling_factor, input_metadata, ) attn_output, _ = self.dense(context_layer) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 6dba952736921..1cba98e625cc6 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -50,7 +50,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class DeepseekMLP(nn.Module): @@ -244,8 +244,9 @@ def forward( qkv, 
_ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 2b5e022312e3b..e7e8271e01c5b 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -47,7 +47,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import RWConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] FalconConfig = Union[HF_FalconConfig, RWConfig] @@ -185,8 +185,9 @@ def forward( q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.use_rotary: q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) attn_output, bias = self.dense(attn_output) return attn_output, bias diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 03948132d32c3..34cecb9f97bd7 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GemmaMLP(nn.Module): @@ -138,8 +138,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) 
q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 661da0fe0434e..6fcb13f177113 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPT2Attention(nn.Module): @@ -85,8 +85,9 @@ def forward( ) -> torch.Tensor: qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, + kv_cache_scaling_factor, input_metadata) attn_output, _ = self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ef4c1d4143c88..79993d938f571 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPTBigCodeAttention(nn.Module): @@ -104,8 +104,9 @@ def forward( ], dim=-1, ) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, + kv_cache_scaling_factor, input_metadata) attn_output, _ = self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_j.py 
b/vllm/model_executor/models/gpt_j.py index 5bab30d9d442e..642a53562662a 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPTJAttention(nn.Module): @@ -98,8 +98,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) attn_output, _ = self.out_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 8f7e1063e0c1d..1ffb4aa6fc8df 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class GPTNeoXAttention(nn.Module): @@ -99,8 +99,9 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py new file mode 100644 index 0000000000000..0ba01243e4eac --- /dev/null +++ b/vllm/model_executor/models/internlm.py @@ -0,0 +1,300 @@ 
+# -*- coding: utf-8 -*- +from typing import Any, Dict, List, Optional, Tuple + +import torch +from torch import nn +from transformers import LlamaConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.attention import PagedAttention +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (LinearMethodBase, + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.weight_utils import (default_weight_loader, + hf_model_weights_iterator) +from vllm.sequence import SamplerOutput + +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] + + +class InternLMMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + linear_method=linear_method) + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + linear_method=linear_method) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. 
" + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class InternLMAttention(nn.Module): + + def __init__( + self, + hidden_size: int, + num_heads: int, + bias: bool, + rope_theta: float = 10000, + max_position_embeddings: int = 8192, + linear_method: Optional[LinearMethodBase] = None, + rope_scaling: Optional[Dict[str, Any]] = None, + ): + super().__init__() + self.hidden_size = hidden_size + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) + self.total_num_heads = num_heads + assert self.total_num_heads % tensor_model_parallel_world_size == 0 + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) + self.head_dim = hidden_size // self.total_num_heads + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + bias=bias, + linear_method=linear_method, + ) + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=bias, + linear_method=linear_method, + ) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, + rope_scaling=rope_scaling, + ) + self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self.rotary_emb(positions, q, k) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) + output, _ = self.o_proj(attn_output) + return 
output + + +class InternLMDecoderLayer(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.self_attn = InternLMAttention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + bias=config.bias, + rope_theta=rope_theta, + max_position_embeddings=max_position_embeddings, + linear_method=linear_method, + rope_scaling=getattr(config, "rope_scaling", None), + ) + self.mlp = InternLMMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + linear_method=linear_method, + ) + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: KVCache, + input_metadata: InputMetadata, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + input_metadata=input_metadata, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +class InternLMModel(nn.Module): + + def __init__( + self, + config: LlamaConfig, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = 
config.vocab_size + + vocab_size = ((config.vocab_size + 63) // 64) * 64 + self.embed_tokens = VocabParallelEmbedding( + vocab_size, + config.hidden_size, + ) + self.layers = nn.ModuleList([ + InternLMDecoderLayer(config, linear_method) + for _ in range(config.num_hidden_layers) + ]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.embed_tokens(input_ids) + residual = None + for i in range(len(self.layers)): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i], + input_metadata, + residual, + ) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class InternLMForCausalLM(nn.Module): + + def __init__( + self, + config, + linear_method: Optional[LinearMethodBase] = None, + ): + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = InternLMModel(config, linear_method) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + input_metadata) + return hidden_states + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.lm_head.weight, hidden_states, + sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + 
("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in hf_model_weights_iterator( + model_name_or_path, cache_dir, load_format, revision): + if "rotary_emb.inv_freq" in name: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ebf1d8a89a022..ac19238992d4d 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -24,7 +24,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class InternLM2MLP(nn.Module): @@ -129,8 +129,9 @@ def forward( qkv, _ = self.wqkv(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.wo(attn_output) return output diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 
d35887cc0f6a3..1c9f39861bd36 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,7 +47,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class LlamaMLP(nn.Module): @@ -155,8 +155,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0100624a44d78..ddc75d9757b42 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -50,7 +50,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class MixtralMoE(nn.Module): @@ -215,8 +215,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index a8dadce24aa1d..b72bfe869be14 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ 
b/vllm/model_executor/models/mixtral_quant.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class MixtralMLP(nn.Module): @@ -232,8 +232,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 22a876e2ef691..28bdc3301c67f 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -24,7 +24,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.mpt import MPTConfig -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] def _get_alibi_slopes( @@ -126,8 +126,9 @@ def forward( if self.qk_ln: q = self.q_ln(q) k = self.k_ln(k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 393b2dcabcd5a..0caa424fb2a36 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, 
Optional[torch.Tensor]] class OPTLearnedPositionalEmbedding(nn.Embedding): @@ -101,9 +101,9 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache = kv_cache + key_cache, value_cache, kv_cache_scaling_factor = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, - input_metadata) + kv_cache_scaling_factor, input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 0b067d4fc8802..c2fc4527aaac3 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -28,7 +28,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class OrionMLP(nn.Module): @@ -133,8 +133,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index d143261968288..b6591f51958cb 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -59,7 +59,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class PhiAttention(nn.Module): @@ -120,8 +120,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache 
= kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 37af84c7cd53f..c6bc4845f7b06 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -29,7 +29,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class QWenMLP(nn.Module): @@ -116,8 +116,9 @@ def forward( qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.c_proj(attn_output) return output diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index e823e6f8c3dbe..9bbfd4864e743 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -47,7 +47,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class Qwen2MLP(nn.Module): @@ -151,8 +151,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + 
kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 44c57e5a6d4f9..96788b5ccba09 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class StablelmMLP(nn.Module): @@ -137,8 +137,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 1eda07b724cae..b3303a8a92ab7 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -47,7 +47,7 @@ # NOTE: Please install transformers from source or use transformers>=4.39.0 from transformers import PretrainedConfig as Starcoder2Config -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class Starcoder2Attention(nn.Module): @@ -121,8 +121,9 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) + k_cache, v_cache, kv_cache_scaling_factor = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, + kv_cache_scaling_factor, 
input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 3570366887e78..8b6b2277bf803 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -261,6 +261,22 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() +def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor]]: + """ + A simple utility to read in KV cache scaling factors that have been + previously serialized to disk. Used by the CacheEngine to populate its + caches with the appropriate scaling factors. The first object of the pair + is the cache (and model) layer corresponding to the scaling factor, and the + second is the scaling factor itself. Keep this function in sync with the output + of 3rdparty/quantization/extract_scales.py and with the scaling factor structure + assumed to hold in worker/cache_engine.py + """ + with open(filename) as f: + layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) + for layer_idx, scale_factor in layer_scale_factor_map.items(): + yield int(layer_idx), float(scale_factor) + + def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: """convert PySafeSlice object from safetensors to torch.Tensor diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 880299783935c..f1136c40a7e27 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,15 +1,18 @@ """CacheEngine class for managing the KV cache.""" -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import torch +from vllm._C import cache_ops from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE +from vllm.model_executor.weight_utils import kv_cache_scales_iterator +from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE logger = 
init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] class CacheEngine: @@ -46,11 +49,20 @@ def __init__( self.dtype = model_config.dtype else: self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] + + # We enable cache scaling factors if and only if cache is FP8-typed + self.use_scaling_factor = torch.tensor([], dtype=self.dtype).element_size() == 1 # Initialize the cache. self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() + # Load scaling factors into the GPU cache if values are specified + # We do not need to load them into the CPU cache because they are + # never swapped out. + if self.cache_config.kv_cache_scales is not None: + self.load_kv_cache_scales(self.cache_config.kv_cache_scales) + # Initialize the stream for caching operations. self.cache_stream = torch.cuda.Stream() assert self.cache_stream != torch.cuda.current_stream() @@ -89,7 +101,15 @@ def allocate_gpu_cache(self) -> List[KVCache]: dtype=self.dtype, device="cuda", ) - gpu_cache.append((key_blocks, value_blocks)) + if self.use_scaling_factor: + scaling_factor = torch.ones( + 1, + dtype=torch.float32, + device='cuda', + ) + else: + scaling_factor = None + gpu_cache.append((key_blocks, value_blocks, scaling_factor)) return gpu_cache def allocate_cpu_cache(self) -> List[KVCache]: @@ -115,7 +135,9 @@ def allocate_cpu_cache(self) -> List[KVCache]: pin_memory=pin_memory, device="cpu", ) - cpu_cache.append((key_blocks, value_blocks)) + # Scale factors are not involved in the swap process and never need to reside on CPU + scaling_factor = None + cpu_cache.append((key_blocks, value_blocks, scaling_factor)) return cpu_cache def _swap( @@ -128,8 +150,13 @@ def _swap( with torch.cuda.stream(self.cache_stream): for i in range(self.num_layers): - src_key_cache, src_value_cache = src[i] - dst_key_cache, dst_value_cache = dst[i] + src_key_cache, src_value_cache, src_scaling = src[i] + 
dst_key_cache, dst_value_cache, dst_scaling = dst[i] + # We should not need to copy scaling factors, as they are equal + # given a fixed layer + # TODO(mattwong) Remove this once confirmed + if self.use_scaling_factor: + assert torch.equal(src_scaling, dst_scaling) # Copy the key blocks. cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) # Copy the value blocks. @@ -145,12 +172,18 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: - from vllm._C import cache_ops - - key_caches = [key_cache for key_cache, _ in self.gpu_cache] - value_caches = [value_cache for _, value_cache in self.gpu_cache] + key_caches = [key_cache for key_cache, _, _ in self.gpu_cache] + value_caches = [value_cache for _, value_cache, _ in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) + + # Helper function to load in static KV cache scaling factors (one per layer) + # stored in a given file. These scaling factors are assumed to not take up + # too much space and are hence permanently resident on GPU. 
+ def load_kv_cache_scales(self, filename: str): + for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): + self.gpu_cache[layer_idx][2].copy_(scaling_factor) + @staticmethod def get_cache_block_size( @@ -171,6 +204,11 @@ def get_cache_block_size( else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] dtype_size = _get_dtype_size(dtype) + + use_scaling_factor = dtype_size == 1 + if use_scaling_factor: + return dtype_size * total + num_layers * 4 + return dtype_size * total diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index efe570778fb43..e41e212e59a18 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -25,7 +25,7 @@ logger = init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] _PAD_SLOT_ID = -1 LORA_WARMUP_RANK = 8 # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. @@ -564,7 +564,7 @@ def prepare_input_tensors( def execute_model( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: List[KVCache], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, @@ -643,7 +643,7 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [(None, None)] * num_layers + kv_caches = [(None, None, None)] * num_layers self.execute_model(seqs, kv_caches) torch.cuda.synchronize() return @@ -823,7 +823,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + kv_caches: List[KVCache], input_metadata: InputMetadata, ) -> torch.Tensor: # KV caches are fixed tensors, so we don't need to copy them. 
From 030c9eb82cd33bace1b57a9ddb5465ca735dc8dd Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 15 Feb 2024 18:08:50 +0000 Subject: [PATCH 083/159] Initial conversion back to KV cache scales in model, using float scaling factor instead of Tensor, should be working out the box --- benchmarks/benchmark_throughput.py | 16 +++++- vllm/config.py | 13 +++-- vllm/engine/arg_utils.py | 14 +++--- vllm/engine/llm_engine.py | 2 +- vllm/model_executor/layers/attention.py | 33 +++++++++---- .../layers/triton_kernel/prefix_prefill.py | 6 ++- vllm/model_executor/models/baichuan.py | 7 ++- vllm/model_executor/models/bloom.py | 7 ++- vllm/model_executor/models/chatglm.py | 5 +- vllm/model_executor/models/deepseek.py | 7 ++- vllm/model_executor/models/falcon.py | 7 ++- vllm/model_executor/models/gemma.py | 7 ++- vllm/model_executor/models/gpt2.py | 5 +- vllm/model_executor/models/gpt_bigcode.py | 5 +- vllm/model_executor/models/gpt_j.py | 7 ++- vllm/model_executor/models/gpt_neox.py | 7 ++- vllm/model_executor/models/internlm.py | 7 ++- vllm/model_executor/models/internlm2.py | 7 ++- vllm/model_executor/models/llama.py | 16 ++++-- vllm/model_executor/models/mixtral.py | 7 ++- vllm/model_executor/models/mixtral_quant.py | 7 ++- vllm/model_executor/models/mpt.py | 7 ++- vllm/model_executor/models/opt.py | 6 +-- vllm/model_executor/models/orion.py | 7 ++- vllm/model_executor/models/phi.py | 7 ++- vllm/model_executor/models/qwen.py | 7 ++- vllm/model_executor/models/qwen2.py | 7 ++- vllm/model_executor/models/stablelm.py | 7 ++- vllm/model_executor/models/starcoder2.py | 7 ++- vllm/model_executor/weight_utils.py | 11 ++--- vllm/worker/cache_engine.py | 49 +++---------------- vllm/worker/model_runner.py | 22 ++++++++- 32 files changed, 160 insertions(+), 169 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 73ff77719f781..625e5b0f79421 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ 
-73,6 +73,7 @@ def run_vllm( enforce_eager: bool, kv_cache_dtype: str, device: str, + kv_cache_scales: Optional[str], ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -87,6 +88,7 @@ def run_vllm( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, + kv_cache_scales=kv_cache_scales, ) # Add the requests to the engine. @@ -108,8 +110,9 @@ def run_vllm( start = time.perf_counter() # FIXME(woosuk): Do not use internal method. - llm._run_engine(use_tqdm=True) + outputs = llm._run_engine(use_tqdm=True) end = time.perf_counter() + print(outputs[-1]) return end - start @@ -211,7 +214,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device) + args.kv_cache_dtype, args.device ,args.kv_cache_scales) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -303,6 +306,15 @@ def main(args: argparse.Namespace): default="cuda", choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') + parser.add_argument( + '--kv-cache-scales', + type=str, + default=None, + help='Path to the JSON file containing the KV cache scaling factors. ' + 'This should generally be supplied when KV cache dtype is FP8. Otherwise ' + 'the KV cache scaling factors default to 1.0, which will likely cause ' + 'accuracy issues. Note FP8 is not supported when cuda version is ' + 'lower than 11.8.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/config.py b/vllm/config.py index 0fdb27a81c49e..abbd75a69c979 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,6 +27,10 @@ class ModelConfig: downloading the model and tokenizer. download_dir: Directory to download and load the weights, default to the default cache directory of huggingface. 
+ kv_cache_scales: Path to file containing a JSON serialization of a map + of layer indices to their respective KV cache scaling factors. Used to + load aforementioned scaling factors into the model when KV cache type + is FP8. load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is @@ -69,6 +73,7 @@ def __init__( tokenizer_mode: str, trust_remote_code: bool, download_dir: Optional[str], + kv_cache_scales: Optional[str], load_format: str, dtype: Union[str, torch.dtype], seed: int, @@ -85,6 +90,7 @@ def __init__( self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code self.download_dir = download_dir + self.kv_cache_scales = kv_cache_scales self.load_format = load_format self.seed = seed self.revision = revision @@ -302,7 +308,6 @@ def __init__( gpu_memory_utilization: float, swap_space: int, cache_dtype: str, - kv_cache_scales: Optional[str] = None, sliding_window: Optional[int] = None, ) -> None: self.block_size = block_size @@ -310,7 +315,6 @@ def __init__( self.swap_space_bytes = swap_space * _GB self.cache_dtype = cache_dtype self.sliding_window = sliding_window - self.kv_cache_scales = kv_cache_scales self._verify_args() self._verify_cache_dtype() @@ -332,11 +336,6 @@ def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass elif self.cache_dtype == "fp8": - if self.kv_cache_scales is None: - logger.warn(f"Using cache dtype {self.cache_dtype} but no " - "scaling factors provided. 
Defaulting to 1.0 " - "scales, be warned that this might lead to " - "inaccurate results!") if not is_hip(): nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version < Version("11.8"): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 2ae7ae57e4a40..4baaffaae159e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -294,16 +294,16 @@ def create_engine_configs( ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, DeviceConfig, Optional[LoRAConfig]]: device_config = DeviceConfig(self.device) - model_config = ModelConfig( - self.model, self.tokenizer, self.tokenizer_mode, - self.trust_remote_code, self.download_dir, self.load_format, - self.dtype, self.seed, self.revision, self.code_revision, - self.tokenizer_revision, self.max_model_len, self.quantization, - self.enforce_eager, self.max_context_len_to_capture) + model_config = ModelConfig(self.model, self.tokenizer, + self.tokenizer_mode, self.trust_remote_code, + self.download_dir, self.kv_cache_scales, self.load_format, + self.dtype, self.seed, self.revision, + self.tokenizer_revision, self.max_model_len, + self.quantization, self.enforce_eager, + self.max_context_len_to_capture) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - self.kv_cache_scales, model_config.get_sliding_window()) parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index eb246c7abe3cd..abdc25826e07d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -101,7 +101,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " - f"kv_cache_scales={cache_config.kv_cache_scales}, " + f"kv_cache_scales={model_config.kv_cache_scales}, " f"device_config={device_config.device}, " f"seed={model_config.seed})") 
# TODO(woosuk): Print more configs in debug mode. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 61d4e9d172c6a..6e95149f11329 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -13,6 +13,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.triton_kernel.prefix_prefill import ( context_attention_fwd) +from vllm.model_executor.layers.linear import set_weight_attrs from vllm.utils import is_hip _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256] @@ -51,6 +52,11 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) + + # This will be set to a float by the model initialization + # if and only if we are using it. Note that this implies we are + # supporting only scalar per-tensor scaling factors for now. + self.kv_cache_scaling_factor = None assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -100,7 +106,6 @@ def forward( value: torch.Tensor, key_cache: Optional[torch.Tensor], value_cache: Optional[torch.Tensor], - kv_cache_scaling_factor: Optional[torch.Tensor], input_metadata: InputMetadata, ) -> torch.Tensor: """PagedAttention forward pass. @@ -122,17 +127,21 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - + + # Store this here as it will be modified if we perform KV cache scaling + softmax_scale = self.scale # Reshape the keys and values and store them in the cache. # If key_cache and value_cache are not provided, the new key and value # vectors will not be cached. This happens during the initial memory # profiling run. 
if key_cache is not None and value_cache is not None: - if kv_cache_scaling_factor is not None: - # Scale the key and value scaling factors for quantization - # by cache ops - key = key.div_(kv_cache_scaling_factor) - value = value.div_(kv_cache_scaling_factor) + # Pre-scale K, V tensors; quantization done by cache_ops + # We will correct for the effects of scaling later + if self.kv_cache_scaling_factor is not None: + key.div_(self.kv_cache_scaling_factor) + value.div_(self.kv_cache_scaling_factor) + # This corrects for the K-tensor scaling. + softmax_scale *= self.kv_cache_scaling_factor cache_ops.reshape_and_cache( key, value, @@ -207,7 +216,7 @@ def forward( value, attn_bias=input_metadata.attn_bias, p=0.0, - scale=self.scale, + scale=softmax_scale, op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (is_hip()) else None, ) @@ -228,6 +237,7 @@ def forward( input_metadata.context_lens, input_metadata.max_seq_len, getattr(self, "alibi_slopes", None), + softmax_scale, ) else: @@ -238,10 +248,13 @@ def forward( value_cache, input_metadata, self.num_kv_heads, - self.scale, + softmax_scale, self.alibi_slopes, ) - + # Correct for the V tensor scaling if it took place + if key_cache is not None and value_cache is not None and \ + self.kv_cache_scaling_factor is not None: + output.mul_(self.kv_cache_scaling_factor) # Reshape the output tensor. 
return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index 70f09224f1cf6..f0b7cf82587d7 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -629,7 +629,8 @@ def context_attention_fwd(q, b_seq_len, b_ctx_len, max_input_len, - alibi_slopes=None): + alibi_slopes=None, + sm_scale=None): cap = torch.cuda.get_device_capability() BLOCK = 128 if cap[0] >= 8 else 64 @@ -638,7 +639,8 @@ def context_attention_fwd(q, assert Lq == Lk and Lk == Lv assert Lk in {16, 32, 64, 128} - sm_scale = 1.0 / (Lq**0.5) + if sm_scale is None: + sm_scale = 1.0 / (Lq**0.5) batch, head = b_seq_len.shape[0], q.shape[1] num_queries_per_kv = q.shape[1] // k.shape[1] diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index cde371d0a7b7f..550dec6487f9e 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -44,7 +44,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -177,9 +177,8 @@ def forward( q, k, v = qkv.chunk(chunks=3, dim=-1) if self.postion_embedding != "ALIBI": q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index f9954849bc081..4adfb6b78102f 100644 --- a/vllm/model_executor/models/bloom.py +++ 
b/vllm/model_executor/models/bloom.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -122,9 +122,8 @@ def forward( del position_ids # Unused. qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 3fc7cafaa006f..dca8d724f976b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -28,7 +28,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import ChatGLMConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GLMAttention(nn.Module): @@ -104,14 +104,13 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(position_ids, q, k) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache context_layer = self.attn( q, k, v, key_cache, value_cache, - kv_cache_scaling_factor, input_metadata, ) attn_output, _ = self.dense(context_layer) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 1cba98e625cc6..6dba952736921 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -50,7 +50,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = 
Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class DeepseekMLP(nn.Module): @@ -244,9 +244,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index e7e8271e01c5b..2b5e022312e3b 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -47,7 +47,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs import RWConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] FalconConfig = Union[HF_FalconConfig, RWConfig] @@ -185,9 +185,8 @@ def forward( q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) if self.use_rotary: q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) attn_output, bias = self.dense(attn_output) return attn_output, bias diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 34cecb9f97bd7..03948132d32c3 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, 
torch.Tensor] class GemmaMLP(nn.Module): @@ -138,9 +138,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 6fcb13f177113..661da0fe0434e 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPT2Attention(nn.Module): @@ -85,9 +85,8 @@ def forward( ) -> torch.Tensor: qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, - kv_cache_scaling_factor, input_metadata) attn_output, _ = self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 79993d938f571..ef4c1d4143c88 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPTBigCodeAttention(nn.Module): @@ -104,9 +104,8 @@ def forward( ], dim=-1, ) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache attn_output = self.attn(q, k, v, 
key_cache, value_cache, - kv_cache_scaling_factor, input_metadata) attn_output, _ = self.c_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 642a53562662a..5bab30d9d442e 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPTJAttention(nn.Module): @@ -98,9 +98,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) attn_output, _ = self.out_proj(attn_output) return attn_output diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 1ffb4aa6fc8df..8f7e1063e0c1d 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -40,7 +40,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class GPTNeoXAttention(nn.Module): @@ -99,9 +99,8 @@ def forward( qkv, _ = self.query_key_value(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.dense(attn_output) return output diff --git 
a/vllm/model_executor/models/internlm.py b/vllm/model_executor/models/internlm.py index 0ba01243e4eac..5d0b93793c89d 100644 --- a/vllm/model_executor/models/internlm.py +++ b/vllm/model_executor/models/internlm.py @@ -24,7 +24,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class InternLMMLP(nn.Module): @@ -114,9 +114,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index ac19238992d4d..ebf1d8a89a022 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -24,7 +24,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class InternLM2MLP(nn.Module): @@ -129,9 +129,8 @@ def forward( qkv, _ = self.wqkv(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.wo(attn_output) return output diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 1c9f39861bd36..e4a1fcf748a97 100644 --- a/vllm/model_executor/models/llama.py 
+++ b/vllm/model_executor/models/llama.py @@ -44,10 +44,11 @@ get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) + hf_model_weights_iterator, + kv_cache_scales_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class LlamaMLP(nn.Module): @@ -155,9 +156,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output @@ -390,3 +390,9 @@ def load_weights(self, weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + # Should not be called unless the KV cache dtype is FP8 + def load_kv_cache_scales(self, filename: str) -> None: + for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): + layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + layer_paged_attn.kv_cache_scaling_factor = scaling_factor diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index ddc75d9757b42..0100624a44d78 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -50,7 +50,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class MixtralMoE(nn.Module): @@ -215,9 +215,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = 
qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index b72bfe869be14..a8dadce24aa1d 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -51,7 +51,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class MixtralMLP(nn.Module): @@ -232,9 +232,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 28bdc3301c67f..22a876e2ef691 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -24,7 +24,7 @@ from vllm.sequence import SamplerOutput from vllm.transformers_utils.configs.mpt import MPTConfig -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] def _get_alibi_slopes( @@ -126,9 +126,8 @@ def forward( if self.qk_ln: q = self.q_ln(q) k = self.k_ln(k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, 
v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 0caa424fb2a36..393b2dcabcd5a 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class OPTLearnedPositionalEmbedding(nn.Embedding): @@ -101,9 +101,9 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) - key_cache, value_cache, kv_cache_scaling_factor = kv_cache + key_cache, value_cache = kv_cache attn_output = self.attn(q, k, v, key_cache, value_cache, - kv_cache_scaling_factor, input_metadata) + input_metadata) output, _ = self.out_proj(attn_output) return output diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index c2fc4527aaac3..0b067d4fc8802 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -28,7 +28,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class OrionMLP(nn.Module): @@ -133,9 +133,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git 
a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index b6591f51958cb..d143261968288 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -59,7 +59,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class PhiAttention(nn.Module): @@ -120,9 +120,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(position_ids, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.dense(attn_output) return output diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index c6bc4845f7b06..37af84c7cd53f 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -29,7 +29,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class QWenMLP(nn.Module): @@ -116,9 +116,8 @@ def forward( qkv, _ = self.c_attn(hidden_states) q, k, v = qkv.chunk(chunks=3, dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.c_proj(attn_output) return output diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9bbfd4864e743..e823e6f8c3dbe 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -47,7 +47,7 @@ 
hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class Qwen2MLP(nn.Module): @@ -151,9 +151,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 96788b5ccba09..44c57e5a6d4f9 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -41,7 +41,7 @@ hf_model_weights_iterator) from vllm.sequence import SamplerOutput -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class StablelmMLP(nn.Module): @@ -137,9 +137,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index b3303a8a92ab7..1eda07b724cae 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -47,7 +47,7 @@ # NOTE: Please install transformers from source or use transformers>=4.39.0 from transformers import PretrainedConfig as 
Starcoder2Config -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class Starcoder2Attention(nn.Module): @@ -121,9 +121,8 @@ def forward( qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache, kv_cache_scaling_factor = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, - kv_cache_scaling_factor, input_metadata) + k_cache, v_cache = kv_cache + attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) output, _ = self.o_proj(attn_output) return output diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 8b6b2277bf803..274b858358a9c 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -264,12 +264,11 @@ def hf_model_weights_iterator( def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor]]: """ A simple utility to read in KV cache scaling factors that have been - previously serialized to disk. Used by the CacheEngine to populate its - caches with the appropriate scaling factors. The first object of the pair - is the cache (and model) layer corresponding to the scaling factor, and the - second is the scaling factor itself. Keep this function in sync with the output - of 3rdparty/quantization/extract_scales.py and with the scaling factor structure - assumed to hold in worker/cache_engine.py + previously serialized to disk. Used by the model to populate the appropriate + KV cache scaling factors. The first object of the pair is the cache (and model) + layer corresponding to the scaling factor, and the second is the scaling factor + itself. 
Keep this function in sync with the output of + 3rdparty/quantization/extract_scales.py """ with open(filename) as f: layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index f1136c40a7e27..9c81bb121ac30 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -1,5 +1,5 @@ """CacheEngine class for managing the KV cache.""" -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Tuple import torch @@ -12,7 +12,7 @@ logger = init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] class CacheEngine: @@ -49,20 +49,11 @@ def __init__( self.dtype = model_config.dtype else: self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - # We enable cache scaling factors if and only if cache is FP8-typed - self.use_scaling_factor = torch.tensor([], dtype=self.dtype).element_size() == 1 # Initialize the cache. self.gpu_cache = self.allocate_gpu_cache() self.cpu_cache = self.allocate_cpu_cache() - # Load scaling factors into the GPU cache if values are specified - # We do not need to load them into the CPU cache because they are - # never swapped out. - if self.cache_config.kv_cache_scales is not None: - self.load_kv_cache_scales(self.cache_config.kv_cache_scales) - # Initialize the stream for caching operations. 
self.cache_stream = torch.cuda.Stream() assert self.cache_stream != torch.cuda.current_stream() @@ -101,15 +92,7 @@ def allocate_gpu_cache(self) -> List[KVCache]: dtype=self.dtype, device="cuda", ) - if self.use_scaling_factor: - scaling_factor = torch.ones( - 1, - dtype=torch.float32, - device='cuda', - ) - else: - scaling_factor = None - gpu_cache.append((key_blocks, value_blocks, scaling_factor)) + gpu_cache.append((key_blocks, value_blocks)) return gpu_cache def allocate_cpu_cache(self) -> List[KVCache]: @@ -135,9 +118,7 @@ def allocate_cpu_cache(self) -> List[KVCache]: pin_memory=pin_memory, device="cpu", ) - # Scale factors are not involved in the swap process and never need to reside on CPU - scaling_factor = None - cpu_cache.append((key_blocks, value_blocks, scaling_factor)) + cpu_cache.append((key_blocks, value_blocks)) return cpu_cache def _swap( @@ -150,13 +131,8 @@ def _swap( with torch.cuda.stream(self.cache_stream): for i in range(self.num_layers): - src_key_cache, src_value_cache, src_scaling = src[i] - dst_key_cache, dst_value_cache, dst_scaling = dst[i] - # We should not need to copy scaling factors, as they are equal - # given a fixed layer - # TODO(mattwong) Remove this once confirmed - if self.use_scaling_factor: - assert torch.equal(src_scaling, dst_scaling) + src_key_cache, src_value_cache = src[i] + dst_key_cache, dst_value_cache = dst[i] # Copy the key blocks. cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) # Copy the value blocks. @@ -176,14 +152,6 @@ def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: value_caches = [value_cache for _, value_cache, _ in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) - - # Helper function to load in static KV cache scaling factors (one per layer) - # stored in a given file. These scaling factors are assumed to not take up - # too much space and are hence permanently resident on GPU. 
- def load_kv_cache_scales(self, filename: str): - for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): - self.gpu_cache[layer_idx][2].copy_(scaling_factor) - @staticmethod def get_cache_block_size( @@ -204,11 +172,6 @@ def get_cache_block_size( else: dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] dtype_size = _get_dtype_size(dtype) - - use_scaling_factor = dtype_size == 1 - if use_scaling_factor: - return dtype_size * total + num_layers * 4 - return dtype_size * total diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e41e212e59a18..f16279842f8a1 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -25,7 +25,7 @@ logger = init_logger(__name__) -KVCache = Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]] +KVCache = Tuple[torch.Tensor, torch.Tensor] _PAD_SLOT_ID = -1 LORA_WARMUP_RANK = 8 # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. @@ -109,6 +109,24 @@ def load_model(self) -> None: self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) + + if self.model_config.kv_cache_scales is not None: + if self.kv_cache_dtype == "fp8": + if callable(getattr(self.model, "load_kv_cache_scales", None)): + self.model.load_kv_cache_scales(self.model_config.kv_cache_scales) + else: + logger.warn("Using FP8 KV cache and scaling factors provided but " + f"model {self.model.__class__} does not support " + "loading scaling factors. Defaulting to 1.0 scales, " + "be warned that this might lead to inaccurate " + "results!") + else: + logger.warn("User provided KV cache scaling factors but these will " + "not be used as the KV cache dtype is not FP8!") + elif self.kv_cache_dtype == "fp8": + logger.warn(f"Using FP8 KV cache but no scaling factors provided. 
" + "Defaulting to 1.0 scales, be warned that this might " + "lead to inaccurate results!") def set_block_size(self, block_size: int) -> None: self.block_size = block_size @@ -643,7 +661,7 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [(None, None, None)] * num_layers + kv_caches = [(None, None)] * num_layers self.execute_model(seqs, kv_caches) torch.cuda.synchronize() return From e00a86dfb1ba63b3854f531b05943bbb65a339dc Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Mon, 19 Feb 2024 16:15:09 -0800 Subject: [PATCH 084/159] Completing KV cache scaling factors ingest (TP>1 todo), clean up code; Isolating math works --- benchmarks/benchmark_latency.py | 10 ++++++++ benchmarks/benchmark_throughput.py | 22 ++++++++--------- vllm/config.py | 8 +++---- vllm/engine/arg_utils.py | 16 ++++++------- vllm/engine/llm_engine.py | 2 +- vllm/model_executor/layers/attention.py | 24 ++++--------------- .../layers/triton_kernel/prefix_prefill.py | 3 +-- vllm/model_executor/models/llama.py | 2 +- vllm/worker/cache_engine.py | 4 ++-- vllm/worker/model_runner.py | 20 +++++++--------- 10 files changed, 52 insertions(+), 59 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index b88772f49ccc7..ba11bbd3097f7 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,6 +25,7 @@ def main(args: argparse.Namespace): dtype=args.dtype, enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, + kv_cache_scales_path=args.kv_cache_scales_path, device=args.device, ) @@ -130,6 +131,15 @@ def run_to_completion(profile_dir: Optional[str] = None): help='Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + parser.add_argument( + '--kv-cache-scales-path', + type=str, + default=None, + help='Path to the JSON files containing the KV cache scaling factors. ' + 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' + 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' + 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 625e5b0f79421..c2bf5bc268e8a 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -73,7 +73,7 @@ def run_vllm( enforce_eager: bool, kv_cache_dtype: str, device: str, - kv_cache_scales: Optional[str], + kv_cache_scales_path: Optional[str], ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -88,7 +88,7 @@ def run_vllm( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, - kv_cache_scales=kv_cache_scales, + kv_cache_scales_path=kv_cache_scales_path, ) # Add the requests to the engine. @@ -214,7 +214,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.device ,args.kv_cache_scales) + args.kv_cache_dtype, args.kv_cache_scales_path, args.device) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -307,14 +307,14 @@ def main(args: argparse.Namespace): choices=["cuda"], help='device type for vLLM execution, supporting CUDA only currently.') parser.add_argument( - '--kv-cache-scales', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. 
' - 'This should generally be supplied when KV cache dtype is FP8. Otherwise ' - 'the KV cache scaling factors default to 1.0, which will likely cause ' - 'accuracy issues. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') + '--kv-cache-scales-path', + type=str, + default=None, + help='Path to the JSON files containing the KV cache scaling factors. ' + 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' + 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' + 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/vllm/config.py b/vllm/config.py index abbd75a69c979..e9f0730941055 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,10 +27,10 @@ class ModelConfig: downloading the model and tokenizer. download_dir: Directory to download and load the weights, default to the default cache directory of huggingface. - kv_cache_scales: Path to file containing a JSON serialization of a map + kv_cache_scales_path: Path to files containing JSON serialization of a map of layer indices to their respective KV cache scaling factors. Used to load aforementioned scaling factors into the model when KV cache type - is FP8. + is FP8_E4M3 on ROCm (AMD GPU). 
load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is @@ -73,7 +73,7 @@ def __init__( tokenizer_mode: str, trust_remote_code: bool, download_dir: Optional[str], - kv_cache_scales: Optional[str], + kv_cache_scales_path: Optional[str], load_format: str, dtype: Union[str, torch.dtype], seed: int, @@ -90,7 +90,7 @@ def __init__( self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code self.download_dir = download_dir - self.kv_cache_scales = kv_cache_scales + self.kv_cache_scales_path = kv_cache_scales_path self.load_format = load_format self.seed = seed self.revision = revision diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4baaffaae159e..6acf7a616608e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,7 @@ class EngineArgs: load_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' - kv_cache_scales: str = None + kv_cache_scales_path: str = None seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False @@ -142,14 +142,14 @@ def add_cli_args( 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( - '--kv-cache-scales', + '--kv-cache-scales-path', type=str, default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied when KV cache dtype is FP8. Otherwise ' - 'the KV cache scaling factors default to 1.0, which will likely cause ' - 'accuracy issues. Note FP8 is not supported when cuda version is ' - 'lower than 11.8.') + help='Path to the JSON files containing the KV cache scaling factors. ' + 'This should generally be supplied, when KV cache dtype is FP8. 
Otherwise, ' + 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' + 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument('--max-model-len', type=int, default=EngineArgs.max_model_len, @@ -296,7 +296,7 @@ def create_engine_configs( device_config = DeviceConfig(self.device) model_config = ModelConfig(self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, - self.download_dir, self.kv_cache_scales, self.load_format, + self.download_dir, self.kv_cache_scales_path, self.load_format, self.dtype, self.seed, self.revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index abdc25826e07d..8d74088b9af07 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -101,7 +101,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " - f"kv_cache_scales={model_config.kv_cache_scales}, " + f"kv_cache_scales_path={model_config.kv_cache_scales_path}, " f"device_config={device_config.device}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 6e95149f11329..f616a1f99b3d4 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -53,9 +53,9 @@ def __init__( alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) - # This will be set to a float by the model initialization - # if and only if we are using it. Note that this implies we are - # supporting only scalar per-tensor scaling factors for now. 
+ # This will be set to a float by model initialization per attention, + # if and only if we are using it. N.B. currently we only support per + # tensor scalar scaling factors & only applicable to ROCm (AMD GPU). self.kv_cache_scaling_factor = None assert self.num_heads % self.num_kv_heads == 0 @@ -128,20 +128,11 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - # Store this here as it will be modified if we perform KV cache scaling - softmax_scale = self.scale # Reshape the keys and values and store them in the cache. # If key_cache and value_cache are not provided, the new key and value # vectors will not be cached. This happens during the initial memory # profiling run. if key_cache is not None and value_cache is not None: - # Pre-scale K, V tensors; quantization done by cache_ops - # We will correct for the effects of scaling later - if self.kv_cache_scaling_factor is not None: - key.div_(self.kv_cache_scaling_factor) - value.div_(self.kv_cache_scaling_factor) - # This corrects for the K-tensor scaling. - softmax_scale *= self.kv_cache_scaling_factor cache_ops.reshape_and_cache( key, value, @@ -216,7 +207,7 @@ def forward( value, attn_bias=input_metadata.attn_bias, p=0.0, - scale=softmax_scale, + scale=self.scale, op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (is_hip()) else None, ) @@ -237,7 +228,6 @@ def forward( input_metadata.context_lens, input_metadata.max_seq_len, getattr(self, "alibi_slopes", None), - softmax_scale, ) else: @@ -248,13 +238,9 @@ def forward( value_cache, input_metadata, self.num_kv_heads, - softmax_scale, + self.scale, self.alibi_slopes, ) - # Correct for the V tensor scaling if it took place - if key_cache is not None and value_cache is not None and \ - self.kv_cache_scaling_factor is not None: - output.mul_(self.kv_cache_scaling_factor) # Reshape the output tensor. 
return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py index f0b7cf82587d7..fda25f1665ebe 100644 --- a/vllm/model_executor/layers/triton_kernel/prefix_prefill.py +++ b/vllm/model_executor/layers/triton_kernel/prefix_prefill.py @@ -639,8 +639,7 @@ def context_attention_fwd(q, assert Lq == Lk and Lk == Lv assert Lk in {16, 32, 64, 128} - if sm_scale is None: - sm_scale = 1.0 / (Lq**0.5) + sm_scale = 1.0 / (Lq**0.5) batch, head = b_seq_len.shape[0], q.shape[1] num_queries_per_kv = q.shape[1] // k.shape[1] diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e4a1fcf748a97..cc3872ad6278b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -391,7 +391,7 @@ def load_weights(self, default_weight_loader) weight_loader(param, loaded_weight) - # Should not be called unless the KV cache dtype is FP8 + # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) def load_kv_cache_scales(self, filename: str) -> None: for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 9c81bb121ac30..2ceab308a9e21 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -148,8 +148,8 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None: self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: - key_caches = [key_cache for key_cache, _, _ in self.gpu_cache] - value_caches = [value_cache for _, value_cache, _ in self.gpu_cache] + key_caches = [key_cache for key_cache, _ in self.gpu_cache] + value_caches = [value_cache for _, value_cache in self.gpu_cache] # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. 
cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f16279842f8a1..fafca668e8f7e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -110,23 +110,21 @@ def load_model(self) -> None: self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) - if self.model_config.kv_cache_scales is not None: + if self.model_config.kv_cache_scales_path is not None: if self.kv_cache_dtype == "fp8": if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales(self.model_config.kv_cache_scales) + self.model.load_kv_cache_scales(self.model_config.kv_cache_scales_path) else: logger.warn("Using FP8 KV cache and scaling factors provided but " - f"model {self.model.__class__} does not support " - "loading scaling factors. Defaulting to 1.0 scales, " - "be warned that this might lead to inaccurate " - "results!") + f"model {self.model.__class__} does not support loading " + "scaling factors. Defaulting to scaling factors of 1.0, " + "This may lead to less accurate results!") else: - logger.warn("User provided KV cache scaling factors but these will " - "not be used as the KV cache dtype is not FP8!") + logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. " + "KV cache scaling factors will not be used.") elif self.kv_cache_dtype == "fp8": - logger.warn(f"Using FP8 KV cache but no scaling factors provided. " - "Defaulting to 1.0 scales, be warned that this might " - "lead to inaccurate results!") + logger.warn(f"Using FP8 KV cache but no scaling factors provided. 
Defaulting to " + "scaling factors of 1.0, This may lead to less accurate results!") def set_block_size(self, block_size: int) -> None: self.block_size = block_size From d3e98f3dbdbb35a148ab0a8d2e248291b1310fc7 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 17:02:35 +0000 Subject: [PATCH 085/159] Fix typos, add a few more sanity checks to the KV cache scales loader, remove PT support from scales extraction utility --- 3rdparty/quantizer/extract_scales.py | 5 +++-- benchmarks/benchmark_latency.py | 2 +- benchmarks/benchmark_throughput.py | 5 ++--- vllm/engine/arg_utils.py | 2 +- vllm/model_executor/weight_utils.py | 4 ++++ 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index ace05e52a69b8..ca1d41dcd40cb 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -25,7 +25,8 @@ def main(args): for name, param in hf_model_weights_iterator(args.model, args.cache_dir, args.load_format, - args.revision): + args.revision, + fall_back_to_pt=False): if "kv_cache_scaling_factor" in name: nums = [int(s) for s in name.split('.') if s.isdigit()] assert len(nums) == 1, f"Could not determine layer idx for {name}!" @@ -67,7 +68,7 @@ def main(args): parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " "containing the KV cache scaling factors.", - choices=["auto", "safetensors", "pt", "npcache"], + choices=["auto", "safetensors", "npcache"], default="auto") parser.add_argument("--revision", help="Optionally specify the model's revision number.", diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index ba11bbd3097f7..e6a2dd2df850b 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -138,7 +138,7 @@ def run_to_completion(profile_dir: Optional[str] = None): help='Path to the JSON files containing the KV cache scaling factors. 
' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( '--profile', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index c2bf5bc268e8a..bc095bbf1300c 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -110,9 +110,8 @@ def run_vllm( start = time.perf_counter() # FIXME(woosuk): Do not use internal method. - outputs = llm._run_engine(use_tqdm=True) + llm._run_engine(use_tqdm=True) end = time.perf_counter() - print(outputs[-1]) return end - start @@ -313,7 +312,7 @@ def main(args: argparse.Namespace): help='Path to the JSON files containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') args = parser.parse_args() if args.tokenizer is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6acf7a616608e..5a94fe3a876b4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -148,7 +148,7 @@ def add_cli_args( help='Path to the JSON files containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. 
' - 'FP8_E5M2 (withour scaling) is only supported on cuda version greater than 11.8. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument('--max-model-len', type=int, diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 274b858358a9c..f645ace7361a3 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -272,6 +272,10 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor] """ with open(filename) as f: layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) + if not isinstance(layer_scale_factor_map, dict) or \ + len(layer_scale_factor_map) == 0: + raise RuntimeError(f"File '{filename}' does not specify a valid " + "layer:scale_factor map.") for layer_idx, scale_factor in layer_scale_factor_map.items(): yield int(layer_idx), float(scale_factor) From 714e42cb6713af0e8fd637d30270d7cdae735d3a Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 19:21:27 +0000 Subject: [PATCH 086/159] Add additional checks to the scaling factor loader and fail gracefully on errors --- vllm/model_executor/weight_utils.py | 38 +++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index f645ace7361a3..a205cc0865620 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -270,14 +270,36 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor] itself. 
Keep this function in sync with the output of 3rdparty/quantization/extract_scales.py """ - with open(filename) as f: - layer_scale_factor_map = json.load(f, parse_int=int, parse_constant=float) - if not isinstance(layer_scale_factor_map, dict) or \ - len(layer_scale_factor_map) == 0: - raise RuntimeError(f"File '{filename}' does not specify a valid " - "layer:scale_factor map.") - for layer_idx, scale_factor in layer_scale_factor_map.items(): - yield int(layer_idx), float(scale_factor) + try: + with open(filename) as f: + # For now we do not obtain any of the benefits of iterators + # but since the number of layers = number of scales is typically + # small, this is not a concern. Loading and processing the entire + # dictionary at once allows us to do sanity checks all at once and + # avoid a situation where we have to abort after having partially + # loaded scaling factors + raw_map = json.load(f, parse_int=int, parse_constant=float) + if not isinstance(raw_map, dict) or len(raw_map) == 0: + raise RuntimeError(f"File '{filename}' does not specify a valid " + "layer:scale_factor map.") + # If any of the inputs are malformed, it will raise an error here and + # be caught in except + layer_scales_map = {int(layer_idx): float(scale) + for layer_idx, scale in raw_map.items()} + return layer_scales_map.items() + + except FileNotFoundError: + logger.error(f"File '{filename}' not found.") + except json.JSONDecodeError: + logger.error(f"Error decoding JSON in file '{filename}'.") + except Exception as e: + logger.error(f"An error occurred while reading file '{filename}': {e}") + # This section is only reached if any of the excepts are hit + # Return an empty iterator (tuple) => no KV cache scales are loaded + # which effectively defaults to 1.0 scales + logger.warn("Defaulting to KV cache scaling factors = 1.0 as an error " + "occurred while trying to load them from file.") + return () def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: From 
3ff51b164fed3116f2efeebeb860c5a8164462f1 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 19:33:41 +0000 Subject: [PATCH 087/159] Remove lingering PT fallback in extraction utility --- 3rdparty/quantizer/extract_scales.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index ca1d41dcd40cb..63104bfb32be9 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -15,7 +15,8 @@ def main(args): hf_folder, _, _ = prepare_hf_model_weights(args.model, args.cache_dir, args.load_format, - revision=args.revision) + revision=args.revision, + fall_back_to_pt=False) output_file = os.path.join(hf_folder, default_output_name) else: output_file = os.path.join(args.output, default_output_name) From e97a31e66d77fbb62d5eb6627a8a2e9d70628e75 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 20 Feb 2024 19:44:04 +0000 Subject: [PATCH 088/159] Add ROCm clarification to extract scales script --- 3rdparty/quantizer/extract_scales.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 63104bfb32be9..a94c4d7d379c9 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -55,8 +55,8 @@ def main(args): "and saves them to a JSON file compatible with later " "use by vLLM (pass this file to the appropriate " "runtime typically using the argument " - "--kv-cache-scales ). This is only used " - "if the KV cache dtype is FP8.") + "--kv-cache-scales-path ). This is only used " + "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--model", help="Specify either a directory or name of a HF model. 
If the model " "does not exist, this utility will attempt to download said model " From 0ba975df9c82fe23599b87ffb1859c4d976d2f14 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 20:57:20 +0000 Subject: [PATCH 089/159] Add scaling factor correction for ROCm FP8 --- vllm/model_executor/layers/attention.py | 4 ++++ vllm/model_executor/models/llama.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index f616a1f99b3d4..2c775eed0e665 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -56,6 +56,10 @@ def __init__( # This will be set to a float by model initialization per attention, # if and only if we are using it. N.B. currently we only support per # tensor scalar scaling factors & only applicable to ROCm (AMD GPU). + # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of setting + # scaling_factor = tensor_amax / FPtype_max self.kv_cache_scaling_factor = None assert self.num_heads % self.num_kv_heads == 0 diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index cc3872ad6278b..5cef260b35642 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -47,6 +47,8 @@ hf_model_weights_iterator, kv_cache_scales_iterator) from vllm.sequence import SamplerOutput +from vllm.config import LoRAConfig +from vllm.utils import is_hip KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -395,4 +397,10 @@ def load_weights(self, def load_kv_cache_scales(self, filename: str) -> None: for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + if is_hip(): + # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of 
setting + # scaling_factor = tensor_amax / FPtype_max + scaling_factor *= 2 layer_paged_attn.kv_cache_scaling_factor = scaling_factor From 6991c5993beb8923302b499f0d6cf2bdb22eea54 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 21:52:05 +0000 Subject: [PATCH 090/159] Correcting a stray type hint --- vllm/model_executor/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index a205cc0865620..d49fc345791b3 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -261,7 +261,7 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, torch.Tensor]]: +def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. Used by the model to populate the appropriate From 229277666a4daf19d0414884a8d857708ad9b176 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 21 Feb 2024 21:12:59 +0000 Subject: [PATCH 091/159] Preliminary TP rank > 1 extraction and loading support --- 3rdparty/quantizer/extract_scales.py | 232 ++++++++++++++++++++++----- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/weight_utils.py | 44 +++-- 3 files changed, 224 insertions(+), 59 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index a94c4d7d379c9..3c54222bb41d2 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -1,52 +1,195 @@ import argparse +import fnmatch +import glob +from huggingface_hub import snapshot_download, HfFileSystem import json +import numpy as np import os -from vllm.model_executor.weight_utils import ( - hf_model_weights_iterator, - prepare_hf_model_weights -) +from safetensors.torch import safe_open +import torch +from typing import 
List, Optional, Tuple -default_output_name = "kv_cache_scales.json" + +# Adapted from vllm/model_executor/weight_utils.py +# The main differences are that we add the NPZ format and that there's no +# need for a file lock when downloading model weights because this tool is +# not intended to be run on multiple processes simultaneously. +# Since our use case is sufficiently different, we define our own function +# here. +def _prepare_hf_weights( + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + fall_back_to_pt: bool = True, + revision: Optional[str] = None, +) -> Tuple[str, List[str], bool]: + # Download model weights from huggingface. + is_local = os.path.isdir(model_name_or_path) + use_safetensors = False + # Some quantized models use .pt files for storing the weights. + if load_format == "auto": + allow_patterns = ["*.safetensors", "*.bin"] + elif load_format == "safetensors": + use_safetensors = True + allow_patterns = ["*.safetensors"] + elif load_format == "pt": + allow_patterns = ["*.pt"] + elif load_format == "npz": + allow_patterns = ["*.npz"] + else: + raise ValueError(f"Unknown load_format: {load_format}") + + if fall_back_to_pt: + allow_patterns += ["*.pt"] + + if not is_local: + # Before we download we look at that is available: + fs = HfFileSystem() + file_list = fs.ls(model_name_or_path, detail=False, revision=revision) + # depending on what is available we download different things + for pattern in allow_patterns: + matching = fnmatch.filter(file_list, pattern) + if len(matching) > 0: + allow_patterns = [pattern] + break + print(f"Downloading model... 
Using model weights format {allow_patterns}") + hf_folder = snapshot_download(model_name_or_path, + allow_patterns=allow_patterns, + cache_dir=cache_dir, + revision=revision) + else: + hf_folder = model_name_or_path + hf_weights_files: List[str] = [] + for pattern in allow_patterns: + hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) + if len(hf_weights_files) > 0: + if pattern == "*.safetensors": + use_safetensors = True + break + if not use_safetensors: + # Exclude files that are not needed for inference. + # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 + blacklist = [ + "training_args.bin", + "optimizer.bin", + "optimizer.pt", + "scheduler.pt", + "scaler.pt", + ] + hf_weights_files = [ + f for f in hf_weights_files + if not any(f.endswith(x) for x in blacklist) + ] + + if len(hf_weights_files) == 0: + raise RuntimeError( + f"Cannot find any model weights with `{model_name_or_path}`") + + return hf_folder, hf_weights_files, use_safetensors +# Adapted from vllm/model_executor/weight_utils.py +def _hf_tensorfile_iterator(filename: str, load_format: str, + use_safetensors: bool): + if load_format == "npz": + assert not use_safetensors + with np.load(filename) as data: + for name in data.files: + param = torch.from_numpy(data[name]) + yield name, param + elif use_safetensors: + with safe_open(filename, framework="pt") as f: + for name in f.keys(): + param = f.get_tensor(name) + yield name, param + else: + state = torch.load(filename, map_location="cpu") + for name, param in state.items(): + yield name, param + del state + torch.cuda.empty_cache() + + +# Used by both main and if __name__ == "__main__" +_default_kvcache_scales_filename = "kv_cache_scales.json" + def main(args): - layer_scale_factors_map = {} - if args.output is None: - hf_folder, _, _ = prepare_hf_model_weights(args.model, - args.cache_dir, - args.load_format, - revision=args.revision, - fall_back_to_pt=False) - output_file = 
os.path.join(hf_folder, default_output_name) + rank_tensors_map = {} + hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( + args.model, + args.cache_dir, + args.load_format, + revision=args.revision, + fall_back_to_pt=True) + # Matches the number immediately after this keyword in the tensor filename to + # determine the TP rank corresponding to said tensor file + rank_keyword = "rank" + for tensor_file in hf_tensor_files: + try: + rank_idx = tensor_file.find(rank_keyword) + if rank_idx != -1: + start_idx = rank_idx + len(rank_keyword) + stop_idx = start_idx + while stop_idx < len(tensor_file) and tensor_file[stop_idx].isdecimal(): + stop_idx += 1 + if stop_idx == start_idx: + raise RuntimeError("Did not find rank # in filename.") + rank = int(tensor_file[start_idx:stop_idx]) + elif len(hf_tensor_files) == 1: + # Since there is only one tensor file, we can assume + # that it's intended for TP rank 0 + rank = 0 + else: + raise RuntimeError(f"Filename does not contain '{rank_keyword}'.") + except RuntimeError: + print("Unable to determine TP rank " + f"corresponding to file '{tensor_file}'") + raise + + if rank not in rank_tensors_map: + layer_scales_map = {} + rank_tensors_map[rank] = layer_scales_map + else: + raise RuntimeError(f"Tensor file '{tensor_file}' shares TP rank {rank} " + "with another tensor file.") + + module_delimiter = ":" if args.load_format == "npz" else "." + for name, param in _hf_tensorfile_iterator(tensor_file, args.load_format, + use_safetensors): + if "kv_cache_scaling_factor" in name: + nums = [int(s) for s in name.split(module_delimiter) if s.isdigit()] + assert len(nums) == 1, f"Could not determine layer idx for {name}" + layer_idx = nums[0] + assert layer_idx not in layer_scales_map, f"Duplicate scaling " \ + f"factor corresponding to layer {layer_idx}" + try: + layer_scales_map[layer_idx] = param.item() + except RuntimeError: + print("This utility supports only per-tensor scalar scale factors " + f"for now. 
The tensor\n {name} = {param} is an invalid " + "scale factor.") + raise + + if args.output_path is None: + output_file = os.path.join(hf_folder, _default_kvcache_scales_filename) else: - output_file = os.path.join(args.output, default_output_name) - if not os.path.isdir(args.output): - os.makedirs(args.output, exist_ok=True) - - for name, param in hf_model_weights_iterator(args.model, - args.cache_dir, - args.load_format, - args.revision, - fall_back_to_pt=False): - if "kv_cache_scaling_factor" in name: - nums = [int(s) for s in name.split('.') if s.isdigit()] - assert len(nums) == 1, f"Could not determine layer idx for {name}!" - layer_idx = nums[0] - assert layer_idx not in layer_scale_factors_map, f"Duplicate scaling " \ - f"factor corresponding to layer {layer_idx}!" - try: - layer_scale_factors_map[layer_idx] = param.item() - except RuntimeError: - print("This utility supports only per-tensor scalar scale factors " - f"for now. The tensor\n {name} = {param} is an invalid " - "scale factor!") - raise - if len(layer_scale_factors_map) == 0: - print("WARNING: No KV cache scale factors found! No output saved.") + output_file = os.path.join(args.output_path, _default_kvcache_scales_filename) + if not os.path.isdir(args.output_path): + os.makedirs(args.output_path, exist_ok=True) + + if (len(rank_tensors_map) == 0 or + all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values())): + print("WARNING: No KV cache scale factors found. No output saved.") else: + tp_world_size = max(rank_tensors_map.keys()) + 1 + for i in range(tp_world_size): + assert i in rank_tensors_map, f"Expected TP world size = {tp_world_size} " \ + "but did not find KV cache scaling factors " \ + f"for TP rank {i}" with open(output_file, 'w') as f: - json.dump(layer_scale_factors_map, f, sort_keys=True) - print(f"Completed! KV cache scaling factors saved to {output_file}") + json.dump(rank_tensors_map, f, sort_keys=True, indent=4) + print(f"Completed! 
Found TP world size = {tp_world_size}.", + f"KV cache scaling factors saved to {output_file}") if __name__ == "__main__": @@ -69,15 +212,16 @@ def main(args): parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " "containing the KV cache scaling factors.", - choices=["auto", "safetensors", "npcache"], + choices=["auto", "safetensors", "npz", "pt"], default="auto") parser.add_argument("--revision", help="Optionally specify the model's revision number.", default=None) - parser.add_argument("--output", - help="Specify the output directory. By default it will be saved in " - f"the model directory with the filename {default_output_name}, " - "however you can override this behavior here.", + parser.add_argument("--output_path", + help="Optionally specify the output directory. By default the " + "scaling factors will be saved in the model directory with the " + f"filename {_default_kvcache_scales_filename}, however you can " + "override this behavior here.", default=None) args = parser.parse_args() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 5cef260b35642..b604c7adb5116 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -41,6 +41,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, @@ -395,7 +396,11 @@ def load_weights(self, # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) def load_kv_cache_scales(self, filename: str) -> None: - for layer_idx, scaling_factor in kv_cache_scales_iterator(filename): + tp_size = get_tensor_model_parallel_world_size() + tp_rank = 
get_tensor_model_parallel_rank() + for layer_idx, scaling_factor in kv_cache_scales_iterator( + filename, tp_rank, tp_size, + self.model.config.num_hidden_layers): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn if is_hip(): # The scaling factor convention we are assuming is diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index d49fc345791b3..6ee3c51629b0b 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -261,14 +261,18 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, float]]: +def kv_cache_scales_iterator(filename: str, + tp_rank: int, + tp_size: int, + num_hidden_layers: int) -> Iterator[Tuple[int, torch.Tensor]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. Used by the model to populate the appropriate - KV cache scaling factors. The first object of the pair is the cache (and model) - layer corresponding to the scaling factor, and the second is the scaling factor - itself. Keep this function in sync with the output of - 3rdparty/quantization/extract_scales.py + KV cache scaling factors. The serialization should represent a dictionary + whose keys are the TP ranks and values are another dictionary mapping layers + to their KV cache scaling factors. 
+ Keep this function in sync with the output + of 3rdparty/quantization/extract_scales.py """ try: with open(filename) as f: @@ -278,14 +282,26 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, float]]: # dictionary at once allows us to do sanity checks all at once and # avoid a situation where we have to abort after having partially # loaded scaling factors - raw_map = json.load(f, parse_int=int, parse_constant=float) - if not isinstance(raw_map, dict) or len(raw_map) == 0: - raise RuntimeError(f"File '{filename}' does not specify a valid " - "layer:scale_factor map.") - # If any of the inputs are malformed, it will raise an error here and - # be caught in except + raw_rank_map = json.load(f, parse_int=int, parse_constant=float) + + # If any of the inputs are malformed, it raises an error somewhere + # in the following lines and is caught in except + assert isinstance(raw_rank_map, dict), "Did not load a dictionary from file." + assert len(raw_rank_map) != 0, "Loaded dictionary is empty." + for rank, scales_map in raw_rank_map.items(): + assert len(scales_map) == num_hidden_layers, "KV cache scales map for TP rank " \ + f"{rank} is malformed. Expected {num_hidden_layers} layers, got {len(scales_map)}." + for i in range(tp_size): + assert i in raw_rank_map or str(i) in raw_rank_map, "KV cache scales map for TP rank " \ + f"{i} not found." + assert tp_rank in raw_rank_map or str(tp_rank) in raw_rank_map, "Tried to load KV cache " \ + f"scales for TP rank {tp_rank} but these were not found." + raw_layer_scales_map = raw_rank_map.get(tp_rank) or raw_rank_map.get(str(tp_rank)) layer_scales_map = {int(layer_idx): float(scale) - for layer_idx, scale in raw_map.items()} + for layer_idx, scale in raw_layer_scales_map.items()} + for i in range(num_hidden_layers): + assert i in layer_scales_map, "Could not find KV cache scales for layer " \ + f"{i} in TP rank {tp_rank}." 
return layer_scales_map.items() except FileNotFoundError: @@ -297,8 +313,8 @@ def kv_cache_scales_iterator(filename: str) -> Iterator[Tuple[int, float]]: # This section is only reached if any of the excepts are hit # Return an empty iterator (tuple) => no KV cache scales are loaded # which effectively defaults to 1.0 scales - logger.warn("Defaulting to KV cache scaling factors = 1.0 as an error " - "occurred while trying to load them from file.") + logger.warn(f"Defaulting to KV cache scaling factors = 1.0 for all layers in TP rank {tp_rank}" + " as an error occurred during loading.") return () From 221699b9633f520b2c8ba5669a7269264ed4ce63 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 21 Feb 2024 21:30:01 +0000 Subject: [PATCH 092/159] Ensure loaded dictionary has same TP size as currently running engine --- vllm/model_executor/weight_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 6ee3c51629b0b..6abae88cfa75e 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -288,6 +288,9 @@ def kv_cache_scales_iterator(filename: str, # in the following lines and is caught in except assert isinstance(raw_rank_map, dict), "Did not load a dictionary from file." assert len(raw_rank_map) != 0, "Loaded dictionary is empty." + loaded_tp_size = max(int(rank) for rank in raw_rank_map) + 1 + assert loaded_tp_size == tp_size, f"Loaded dictionary has TP size {loaded_tp_size} " \ + f"but LLM engine is currently running with TP size {tp_size}." for rank, scales_map in raw_rank_map.items(): assert len(scales_map) == num_hidden_layers, "KV cache scales map for TP rank " \ f"{rank} is malformed. Expected {num_hidden_layers} layers, got {len(scales_map)}." 
From 96a7546b24900a5049a23401632d0d56ab85a552 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 00:48:55 +0000 Subject: [PATCH 093/159] Add tp_size argument for user to specify TP size to expect in quantized model. Renamed --model argument to --quantized_model to be explicit. --- 3rdparty/quantizer/extract_scales.py | 43 ++++++++++++++++++---------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 3c54222bb41d2..eebbd88a62d13 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -116,7 +116,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def main(args): rank_tensors_map = {} hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( - args.model, + args.quantized_model, args.cache_dir, args.load_format, revision=args.revision, @@ -181,14 +181,18 @@ def main(args): all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values())): print("WARNING: No KV cache scale factors found. No output saved.") else: - tp_world_size = max(rank_tensors_map.keys()) + 1 - for i in range(tp_world_size): - assert i in rank_tensors_map, f"Expected TP world size = {tp_world_size} " \ + empirical_tp_world_size = max(rank_tensors_map.keys()) + 1 + if args.tp_size is not None: + assert args.tp_size == empirical_tp_world_size, "User expected TP world size = " \ + f"{args.tp_size} but expecting TP world size = {empirical_tp_world_size} from " \ + "model instead." + for i in range(empirical_tp_world_size): + assert i in rank_tensors_map, f"Expected TP world size = {empirical_tp_world_size} " \ "but did not find KV cache scaling factors " \ f"for TP rank {i}" with open(output_file, 'w') as f: json.dump(rank_tensors_map, f, sort_keys=True, indent=4) - print(f"Completed! Found TP world size = {tp_world_size}.", + print(f"Completed! 
Found TP world size = {empirical_tp_world_size}.", f"KV cache scaling factors saved to {output_file}") @@ -198,16 +202,14 @@ def main(args): "and saves them to a JSON file compatible with later " "use by vLLM (pass this file to the appropriate " "runtime typically using the argument " - "--kv-cache-scales-path ). This is only used " + "--kv_cache_scales_path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") - parser.add_argument("--model", - help="Specify either a directory or name of a HF model. If the model " - "does not exist, this utility will attempt to download said model " - "from the HF repo.", + parser.add_argument("--quantized_model", + help="Specify either the local path to, or name of, a quantized HF model.", required=True) parser.add_argument("--cache_dir", - help="Optionally specify a cache directory to use for a HF model " - "download.", + help="Optionally specify a cache directory to use in the event of a HF " + "model download.", default=None) parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " @@ -219,10 +221,21 @@ def main(args): default=None) parser.add_argument("--output_path", help="Optionally specify the output directory. By default the " - "scaling factors will be saved in the model directory with the " - f"filename {_default_kvcache_scales_filename}, however you can " - "override this behavior here.", + "KV cache scaling factors will be saved in the model directory " + f"with the filename {_default_kvcache_scales_filename}, however " + "you can override this behavior here.", default=None) + parser.add_argument("--tp_size", + help="Optionally specify the tensor-parallel (TP) size that the " + "quantized model should correspond to. If specified, during KV " + "cache scaling factor extraction the observed TP size will be " + "checked against this and an error will be raised if there is " + "a mismatch. 
If not specified, the quantized model's expected " + "TP size is instead inferred from the largest TP rank observed. " + "The expected TP size is cross-checked against the TP ranks " + "observed in the quantized model and an error is raised if any " + "discrepancies are found.", + default=None, type=int) args = parser.parse_args() main(args) From 9d08a92d0714278a46f1f1734f1e5bf538a3c1c2 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 00:54:44 +0000 Subject: [PATCH 094/159] Add specific FP8 E4M3 and ROCm flavor text to the --quantized_model argument --- 3rdparty/quantizer/extract_scales.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index eebbd88a62d13..7a26cff7d93af 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -205,7 +205,9 @@ def main(args): "--kv_cache_scales_path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--quantized_model", - help="Specify either the local path to, or name of, a quantized HF model.", + help="Specify either the local path to, or name of, a quantized HF model. 
" + "It is expected that the quantization format is FP8_E4M3, for use on ROCm " + "(AMD GPU).", required=True) parser.add_argument("--cache_dir", help="Optionally specify a cache directory to use in the event of a HF " From 553209b25bbe16878d4f2ecc337a9885208fc8c8 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 01:00:44 +0000 Subject: [PATCH 095/159] Small tweak on expected TP size flavor text for clarity --- 3rdparty/quantizer/extract_scales.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 7a26cff7d93af..84a28c33a7d24 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -184,8 +184,8 @@ def main(args): empirical_tp_world_size = max(rank_tensors_map.keys()) + 1 if args.tp_size is not None: assert args.tp_size == empirical_tp_world_size, "User expected TP world size = " \ - f"{args.tp_size} but expecting TP world size = {empirical_tp_world_size} from " \ - "model instead." + f"{args.tp_size} from model but tool is expecting TP world size = " \ + f"{empirical_tp_world_size} from model instead." 
for i in range(empirical_tp_world_size): assert i in rank_tensors_map, f"Expected TP world size = {empirical_tp_world_size} " \ "but did not find KV cache scaling factors " \ From c8525498697d3057805d53aea18e4bab2674261a Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 19:03:49 +0000 Subject: [PATCH 096/159] Add output filename argument, rename output_path to output_dir, and clean up some logic --- 3rdparty/quantizer/extract_scales.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 84a28c33a7d24..ac859f79e88d0 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -110,9 +110,6 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, torch.cuda.empty_cache() -# Used by both main and if __name__ == "__main__" -_default_kvcache_scales_filename = "kv_cache_scales.json" - def main(args): rank_tensors_map = {} hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( @@ -157,7 +154,7 @@ def main(args): for name, param in _hf_tensorfile_iterator(tensor_file, args.load_format, use_safetensors): if "kv_cache_scaling_factor" in name: - nums = [int(s) for s in name.split(module_delimiter) if s.isdigit()] + nums = [int(s) for s in name.split(module_delimiter) if s.isdecimal()] assert len(nums) == 1, f"Could not determine layer idx for {name}" layer_idx = nums[0] assert layer_idx not in layer_scales_map, f"Duplicate scaling " \ @@ -171,14 +168,14 @@ def main(args): raise if args.output_path is None: - output_file = os.path.join(hf_folder, _default_kvcache_scales_filename) + output_file = os.path.join(hf_folder, args.output_name) else: - output_file = os.path.join(args.output_path, _default_kvcache_scales_filename) + output_file = os.path.join(args.output_path, args.output_name) if not os.path.isdir(args.output_path): os.makedirs(args.output_path, exist_ok=True) - if 
(len(rank_tensors_map) == 0 or - all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values())): + if all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values()): + # Note: this is true even if the rank_tensors_map is empty print("WARNING: No KV cache scale factors found. No output saved.") else: empirical_tp_world_size = max(rank_tensors_map.keys()) + 1 @@ -221,12 +218,14 @@ def main(args): parser.add_argument("--revision", help="Optionally specify the model's revision number.", default=None) - parser.add_argument("--output_path", + parser.add_argument("--output_dir", help="Optionally specify the output directory. By default the " - "KV cache scaling factors will be saved in the model directory " - f"with the filename {_default_kvcache_scales_filename}, however " - "you can override this behavior here.", + "KV cache scaling factors will be saved in the model directory, " + "however you can override this behavior here.", default=None) + parser.add_argument("--output_name", + help="Optionally specify the output filename.", + default="kv_cache_scales.json") parser.add_argument("--tp_size", help="Optionally specify the tensor-parallel (TP) size that the " "quantized model should correspond to. 
If specified, during KV " From e23379e1559124a98117951f050bd2ee87468694 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 19:06:22 +0000 Subject: [PATCH 097/159] Fix up remaining 'output_path's from the rename --- 3rdparty/quantizer/extract_scales.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index ac859f79e88d0..357a971f96830 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -167,12 +167,12 @@ def main(args): "scale factor.") raise - if args.output_path is None: + if args.output_dir is None: output_file = os.path.join(hf_folder, args.output_name) else: - output_file = os.path.join(args.output_path, args.output_name) - if not os.path.isdir(args.output_path): - os.makedirs(args.output_path, exist_ok=True) + output_file = os.path.join(args.output_dir, args.output_name) + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) if all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values()): # Note: this is true even if the rank_tensors_map is empty From 40171c9e7617a271653768ff83fa76667a54c501 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 21:31:20 +0000 Subject: [PATCH 098/159] Strip out download functionality in scale extraction utility --- 3rdparty/quantizer/extract_scales.py | 62 +++++++--------------------- vllm/model_executor/weight_utils.py | 4 +- 2 files changed, 17 insertions(+), 49 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 357a971f96830..ee5e7d3068610 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -11,20 +11,17 @@ # Adapted from vllm/model_executor/weight_utils.py -# The main differences are that we add the NPZ format and that there's no -# need for a file lock when downloading model weights because this tool 
is -# not intended to be run on multiple processes simultaneously. -# Since our use case is sufficiently different, we define our own function -# here. +# The main differences are that we add the NPZ format and simplify +# its functionality drastically for our purposes (e.g. we assume that +# the quantized model exists locally and there is no need to download it) def _prepare_hf_weights( - model_name_or_path: str, - cache_dir: Optional[str] = None, + quantized_model_dir: str, load_format: str = "auto", fall_back_to_pt: bool = True, - revision: Optional[str] = None, ) -> Tuple[str, List[str], bool]: - # Download model weights from huggingface. - is_local = os.path.isdir(model_name_or_path) + if not os.path.isdir(quantized_model_dir): + raise FileNotFoundError(f"The quantized model directory `{quantized_model_dir}` " + "does not exist.") use_safetensors = False # Some quantized models use .pt files for storing the weights. if load_format == "auto": @@ -38,34 +35,17 @@ def _prepare_hf_weights( allow_patterns = ["*.npz"] else: raise ValueError(f"Unknown load_format: {load_format}") - if fall_back_to_pt: allow_patterns += ["*.pt"] - if not is_local: - # Before we download we look at that is available: - fs = HfFileSystem() - file_list = fs.ls(model_name_or_path, detail=False, revision=revision) - # depending on what is available we download different things - for pattern in allow_patterns: - matching = fnmatch.filter(file_list, pattern) - if len(matching) > 0: - allow_patterns = [pattern] - break - print(f"Downloading model... 
Using model weights format {allow_patterns}") - hf_folder = snapshot_download(model_name_or_path, - allow_patterns=allow_patterns, - cache_dir=cache_dir, - revision=revision) - else: - hf_folder = model_name_or_path hf_weights_files: List[str] = [] for pattern in allow_patterns: - hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) + hf_weights_files += glob.glob(os.path.join(quantized_model_dir, pattern)) if len(hf_weights_files) > 0: if pattern == "*.safetensors": use_safetensors = True break + if not use_safetensors: # Exclude files that are not needed for inference. # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 @@ -83,9 +63,9 @@ def _prepare_hf_weights( if len(hf_weights_files) == 0: raise RuntimeError( - f"Cannot find any model weights with `{model_name_or_path}`") + f"Cannot find any model weights with `{quantized_model_dir}`") - return hf_folder, hf_weights_files, use_safetensors + return hf_weights_files, use_safetensors # Adapted from vllm/model_executor/weight_utils.py @@ -112,12 +92,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def main(args): rank_tensors_map = {} - hf_folder, hf_tensor_files, use_safetensors = _prepare_hf_weights( - args.quantized_model, - args.cache_dir, - args.load_format, - revision=args.revision, - fall_back_to_pt=True) + hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) # Matches the number immediately after this keyword in the tensor filename to # determine the TP rank corresponding to said tensor file rank_keyword = "rank" @@ -152,7 +127,7 @@ def main(args): module_delimiter = ":" if args.load_format == "npz" else "." 
for name, param in _hf_tensorfile_iterator(tensor_file, args.load_format, - use_safetensors): + use_safetensors): if "kv_cache_scaling_factor" in name: nums = [int(s) for s in name.split(module_delimiter) if s.isdecimal()] assert len(nums) == 1, f"Could not determine layer idx for {name}" @@ -168,7 +143,7 @@ def main(args): raise if args.output_dir is None: - output_file = os.path.join(hf_folder, args.output_name) + output_file = os.path.join(args.quantized_model, args.output_name) else: output_file = os.path.join(args.output_dir, args.output_name) if not os.path.isdir(args.output_dir): @@ -202,22 +177,15 @@ def main(args): "--kv_cache_scales_path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--quantized_model", - help="Specify either the local path to, or name of, a quantized HF model. " + help="Specify the directory containing a single quantized HF model. " "It is expected that the quantization format is FP8_E4M3, for use on ROCm " "(AMD GPU).", required=True) - parser.add_argument("--cache_dir", - help="Optionally specify a cache directory to use in the event of a HF " - "model download.", - default=None) parser.add_argument("--load_format", help="Optionally specify the format of the model's tensor files " "containing the KV cache scaling factors.", choices=["auto", "safetensors", "npz", "pt"], default="auto") - parser.add_argument("--revision", - help="Optionally specify the model's revision number.", - default=None) parser.add_argument("--output_dir", help="Optionally specify the output directory. 
By default the " "KV cache scaling factors will be saved in the model directory, " diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 6abae88cfa75e..1acfcf5fd0283 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -308,11 +308,11 @@ def kv_cache_scales_iterator(filename: str, return layer_scales_map.items() except FileNotFoundError: - logger.error(f"File '{filename}' not found.") + logger.error(f"File or directory '{filename}' not found.") except json.JSONDecodeError: logger.error(f"Error decoding JSON in file '{filename}'.") except Exception as e: - logger.error(f"An error occurred while reading file '{filename}': {e}") + logger.error(f"An error occurred while reading '{filename}': {e}") # This section is only reached if any of the excepts are hit # Return an empty iterator (tuple) => no KV cache scales are loaded # which effectively defaults to 1.0 scales From 76665876279c9ffa9125bfd9eb22dd221b565292 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 23 Feb 2024 22:02:31 +0000 Subject: [PATCH 099/159] Correct a stray type hint --- vllm/model_executor/weight_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 1acfcf5fd0283..960b427a0eabb 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -264,7 +264,7 @@ def hf_model_weights_iterator( def kv_cache_scales_iterator(filename: str, tp_rank: int, tp_size: int, - num_hidden_layers: int) -> Iterator[Tuple[int, torch.Tensor]]: + num_hidden_layers: int) -> Iterator[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. 
Used by the model to populate the appropriate From 7c0bf6e783d7ea1610c30dee32367f1c9653df90 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Mon, 26 Feb 2024 20:28:33 +0000 Subject: [PATCH 100/159] Change convention: Initialize scaling factors always if KV cache is FP8 and exceptions are handled, else terminate program execution --- vllm/model_executor/models/llama.py | 13 +++++++++++++ vllm/worker/model_runner.py | 26 +++++++++++++++----------- 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b604c7adb5116..cddbd680ea095 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -395,7 +395,14 @@ def load_weights(self, weight_loader(param, loaded_weight) # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) + # If this function is called, it should always initialize KV cache scale + # factors (or else raise an exception). Thus, handled exceptions should + # make sure to leave KV cache scale factors in a known good (dummy) state def load_kv_cache_scales(self, filename: str) -> None: + # Initialize KV cache scales to dummy values first. 
These will be + # overwritten by the actual values if and only if the later loading + # process completes without error + self.load_dummy_kv_cache_scales() tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_iterator( @@ -409,3 +416,9 @@ def load_kv_cache_scales(self, filename: str) -> None: # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 layer_paged_attn.kv_cache_scaling_factor = scaling_factor + + # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) + def load_dummy_kv_cache_scales(self) -> None: + for layer_idx in range(self.model.config.num_hidden_layers): + layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + setattr(layer_paged_attn, "kv_cache_scaling_factor", 1.0) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fafca668e8f7e..c9e6ceffc5502 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -110,21 +110,25 @@ def load_model(self) -> None: self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) - if self.model_config.kv_cache_scales_path is not None: - if self.kv_cache_dtype == "fp8": + if self.kv_cache_dtype == "fp8": + if self.model_config.kv_cache_scales_path is not None: if callable(getattr(self.model, "load_kv_cache_scales", None)): self.model.load_kv_cache_scales(self.model_config.kv_cache_scales_path) else: - logger.warn("Using FP8 KV cache and scaling factors provided but " - f"model {self.model.__class__} does not support loading " - "scaling factors. 
Defaulting to scaling factors of 1.0, " - "This may lead to less accurate results!") + raise RuntimeError("Using FP8 KV cache and scaling factors provided but " + f"model {self.model.__class__} does not support loading " + "scaling factors.") + elif callable(getattr(self.model, "load_dummy_kv_cache_scales", None)): + logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " + "scaling factors of 1.0, This may lead to less accurate results!") + self.model.load_dummy_kv_cache_scales() else: - logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. " - "KV cache scaling factors will not be used.") - elif self.kv_cache_dtype == "fp8": - logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " - "scaling factors of 1.0, This may lead to less accurate results!") + raise RuntimeError(f"Using FP8 KV cache but no scaling factors provided and model " + "does not support loading dummy KV cache scaling factors.") + elif self.model_config.kv_cache_scales_path is not None: + logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. 
" + "KV cache scaling factors will not be used.") + def set_block_size(self, block_size: int) -> None: self.block_size = block_size From 42e2aeffede0d60829b2e197dfae641181867873 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 23 Feb 2024 21:18:46 +0000 Subject: [PATCH 101/159] Add example output for extract_scales --- .../llama2-7b-fp8-kv/kv_cache_scales.json | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json diff --git a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json new file mode 100644 index 0000000000000..3a29f7e321391 --- /dev/null +++ b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json @@ -0,0 +1,36 @@ +{ + "0": { + "0": 0.0152239128947258, + "1": 0.0188860222697258, + "2": 0.0354178324341774, + "3": 0.0376674123108387, + "4": 0.0418526791036129, + "5": 0.0433175228536129, + "6": 0.0397600457072258, + "7": 0.0424455925822258, + "8": 0.0415387861430645, + "9": 0.0408412404358387, + "10": 0.0395856611430645, + "11": 0.0377371683716774, + "12": 0.0400739423930645, + "13": 0.040771484375, + "14": 0.0393415205180645, + "15": 0.0369001142680645, + "16": 0.03857421875, + "17": 0.0387486070394516, + "18": 0.0403180830180645, + "19": 0.0396205373108387, + "20": 0.0375627800822258, + "21": 0.0407366082072258, + "22": 0.0432477705180645, + "23": 0.0377022884786129, + "24": 0.0399693101644516, + "25": 0.0374581478536129, + "26": 0.0413295216858387, + "27": 0.0442243330180645, + "28": 0.0424804724752903, + "29": 0.0456891767680645, + "30": 0.0409109964966774, + "31": 0.0482352152466774 + } +} \ No newline at end of file From d3dbb1ae4229d63e2024ee10bf5d19397a5d2e37 Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Fri, 23 Feb 2024 16:45:00 -0600 Subject: [PATCH 102/159] Create README.md and add usage example --- tests/fp8_kv/README.md | 105 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 
tests/fp8_kv/README.md diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md new file mode 100644 index 0000000000000..ccfb24686c341 --- /dev/null +++ b/tests/fp8_kv/README.md @@ -0,0 +1,105 @@ +# FP8 KV Cache Extraction Tool + +This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. + +## Prerequisites + +- Python 3.x +- PyTorch +- NumPy +- Hugging Face Transformers +- Hugging Face Hub +- AMMO + +Before using this tool, you'll need to follow these steps: + +1. Install all necessary prerequisites and dependencies. +2. Convert HF model into a quantized HF model. +3. Extract KV Cache Scaling Factors from quantized HF model. +4. Load KV Cache Scaling Factors into VLLM. + +### 2. Convert HF model into a quantized HF model. +Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). +## APIs + +[`quantize.py`](3rdparty/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). See this [`doc`](../../docs/source/new_workflow.md) for more details on the TensorRT-LLM checkpoint format. + +The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found [here](https://github.com/ROCm/vllm-fp8/blob/fp8_kv/3rdparty/README.md). + +### 3. Extract KV Cache Scaling Factors from quantized HF model. +extract_scales.py can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports LLaMa models. 
It is also important to note the following: +1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. + +2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. + +3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks. + +```python +# prerequisites: +# - Quantized HF LLaMa model +python3 3rdparty/quantizer/extract_scales.py --help +Useage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--cache_dir CACHE_DIR] [--load_format {auto,safetensors,npz,pt}] [--revision REVISION] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] + +KV Scale Extraction Example + +optional arguments: +--quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). +Optional arguments: + +--cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None) +--load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto) +--revision: Specify the model's revision number. (Default: None) +--output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) +--output_name: Specify the output filename. (Default: kv_cache_scales.json) +--tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. 
If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None) + +Example: +python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir + +### 4. Load KV Cache Scaling Factors into VLLM. +# prerequisites: +# - LLaMa kv_cache_scales.json file + +python3 benchmarks/benchmark_throughput.py --help +usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] + [--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] + [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code] + [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}] + [--kv-cache-scales-path KV_CACHE_SCALES_PATH] + +Benchmark Throughput Example +optional arguments: + -h, --help show this help message and exit + --backend {vllm,hf,mii} + --dataset DATASET Path to the dataset. + --input-len INPUT_LEN + Input prompt length for each request + --output-len OUTPUT_LEN + Output length for each request. Overrides the output length from the dataset. + --model MODEL + --tokenizer TOKENIZER + --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None} + --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE + --n N Number of generated sequences per prompt. + --use-beam-search + --num-prompts NUM_PROMPTS + Number of prompts to process. + --seed SEED + --hf-max-batch-size HF_MAX_BATCH_SIZE + Maximum batch size for HF backend. + --trust-remote-code trust remote code from huggingface + --max-model-len MAX_MODEL_LEN + Maximum length of a sequence (including prompt and output). If None, will be derived from the model. 
+ --dtype {auto,half,float16,bfloat16,float,float32} + data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 + models. + --enforce-eager enforce eager execution + --kv-cache-dtype {auto,fp8} + Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than + 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --kv-cache-scales-path KV_CACHE_SCALES_PATH + Path to the JSON files containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache + scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. + On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. +Example: +python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path From f39839bea34f123692e6f0c6b8f295b7aa4fd98c Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Fri, 23 Feb 2024 16:52:31 -0600 Subject: [PATCH 103/159] Added benchmark description --- tests/fp8_kv/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index ccfb24686c341..da2d5c5c7acf0 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -55,8 +55,10 @@ Optional arguments: Example: python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir - +``` ### 4. Load KV Cache Scaling Factors into VLLM. +The script evaluates the inference throughput of language models using various backends such as vLLM, HF (Hugging Face). It measures the time taken to process a given number of prompts and generate sequences for each prompt. 
The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. +```python # prerequisites: # - LLaMa kv_cache_scales.json file @@ -103,3 +105,4 @@ optional arguments: On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. Example: python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path +```python From 2cfea655ae6ff81c8c16b03f6c8cacb6ac46f8de Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Fri, 23 Feb 2024 20:00:46 -0600 Subject: [PATCH 104/159] Clean up readme --- tests/fp8_kv/README.md | 62 +++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index da2d5c5c7acf0..e71a8b58dce69 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -1,4 +1,4 @@ -# FP8 KV Cache Extraction Tool +# FP8 KV Cache This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. @@ -11,8 +11,7 @@ This utility extracts the KV cache scaling factors from a quantized HF (Hugging - Hugging Face Hub - AMMO -Before using this tool, you'll need to follow these steps: - +Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps: 1. Install all necessary prerequisites and dependencies. 2. Convert HF model into a quantized HF model. 3. Extract KV Cache Scaling Factors from quantized HF model. @@ -20,14 +19,13 @@ Before using this tool, you'll need to follow these steps: ### 2. Convert HF model into a quantized HF model. 
Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). -## APIs -[`quantize.py`](3rdparty/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). See this [`doc`](../../docs/source/new_workflow.md) for more details on the TensorRT-LLM checkpoint format. +[`quantize.py`](https://github.com/ROCm/vllm-fp8/blob/fp8_doc/3rdparty/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found [here](https://github.com/ROCm/vllm-fp8/blob/fp8_kv/3rdparty/README.md). ### 3. Extract KV Cache Scaling Factors from quantized HF model. -extract_scales.py can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports LLaMa models. It is also important to note the following: +[`extract_scales.py`](https://github.com/ROCm/vllm-fp8/blob/fp8_doc/3rdparty/quantizer/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: 1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. 
These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. 2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. @@ -36,7 +34,7 @@ extract_scales.py can be utilized to extract the KV cache scaling factors from y ```python # prerequisites: -# - Quantized HF LLaMa model +# - Quantized HF LLaMa 2 model python3 3rdparty/quantizer/extract_scales.py --help Useage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--cache_dir CACHE_DIR] [--load_format {auto,safetensors,npz,pt}] [--revision REVISION] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] @@ -45,22 +43,22 @@ KV Scale Extraction Example optional arguments: --quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). Optional arguments: - --cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None) --load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto) --revision: Specify the model's revision number. (Default: None) --output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) --output_name: Specify the output filename. (Default: kv_cache_scales.json) --tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. 
(Default: None) - +``` +```python Example: -python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir +python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. -The script evaluates the inference throughput of language models using various backends such as vLLM, HF (Hugging Face). It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. +This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. ```python # prerequisites: -# - LLaMa kv_cache_scales.json file +# - LLaMa 2 kv_cache_scales.json file python3 benchmarks/benchmark_throughput.py --help usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL] @@ -71,38 +69,28 @@ usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET Benchmark Throughput Example optional arguments: - -h, --help show this help message and exit + -h, --help show this help message and exit --backend {vllm,hf,mii} - --dataset DATASET Path to the dataset. - --input-len INPUT_LEN - Input prompt length for each request - --output-len OUTPUT_LEN - Output length for each request. Overrides the output length from the dataset. + --dataset DATASET Path to the dataset. + --input-len INPUT_LEN Input prompt length for each request + --output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset. 
--model MODEL --tokenizer TOKENIZER --quantization {awq,gptq,squeezellm,None}, -q {awq,gptq,squeezellm,None} --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE - --n N Number of generated sequences per prompt. + --n N Number of generated sequences per prompt. --use-beam-search - --num-prompts NUM_PROMPTS - Number of prompts to process. + --num-prompts NUM_PROMPTS Number of prompts to process. --seed SEED - --hf-max-batch-size HF_MAX_BATCH_SIZE - Maximum batch size for HF backend. - --trust-remote-code trust remote code from huggingface - --max-model-len MAX_MODEL_LEN - Maximum length of a sequence (including prompt and output). If None, will be derived from the model. - --dtype {auto,half,float16,bfloat16,float,float32} - data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 - models. - --enforce-eager enforce eager execution - --kv-cache-dtype {auto,fp8} - Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than - 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. - --kv-cache-scales-path KV_CACHE_SCALES_PATH - Path to the JSON files containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache - scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. - On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --hf-max-batch-size HF_MAX_BATCH_SIZE Maximum batch size for HF backend. + --trust-remote-code trust remote code from huggingface + --max-model-len MAX_MODEL_LEN Maximum length of a sequence (including prompt and output). If None, will be derived from the model. + --dtype {auto,half,float16,bfloat16,float,float32} data type for model weights and activations. 
The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. + --enforce-eager enforce eager execution + --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --kv-cache-scales-path KV_CACHE_SCALES_PATH Path to the JSON files containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. +``` +``` Example: python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path ```python From 257a7da4413606ee5740a2582a1f814bb9713f81 Mon Sep 17 00:00:00 2001 From: Adrian Abeyta Date: Mon, 26 Feb 2024 15:47:40 -0600 Subject: [PATCH 105/159] Updated example descriptions --- tests/fp8_kv/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index e71a8b58dce69..fde6e9d6187c3 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -52,7 +52,7 @@ Optional arguments: ``` ```python Example: -python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir +python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt.
The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. @@ -92,5 +92,5 @@ optional arguments: ``` ``` Example: -python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path +python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path ```python From 730562a63c97071d9b6c840f41053ad4f98929bb Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Tue, 27 Feb 2024 20:20:45 +0000 Subject: [PATCH 106/159] Update KV cache scales loader name to clarify that we are not using an iterator at present --- vllm/model_executor/models/llama.py | 4 ++-- vllm/model_executor/weight_utils.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index cddbd680ea095..7ada55fab3876 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -46,7 +46,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator, - kv_cache_scales_iterator) + kv_cache_scales_loader) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig from vllm.utils import is_hip @@ -405,7 +405,7 @@ def load_kv_cache_scales(self, filename: str) -> None: self.load_dummy_kv_cache_scales() tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_iterator( + for layer_idx, scaling_factor in kv_cache_scales_loader( filename, tp_rank, tp_size, self.model.config.num_hidden_layers): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 960b427a0eabb..33aae4a1ea733 100644 --- 
a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -261,7 +261,7 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_iterator(filename: str, +def kv_cache_scales_loader(filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int) -> Iterator[Tuple[int, float]]: @@ -276,12 +276,12 @@ def kv_cache_scales_iterator(filename: str, """ try: with open(filename) as f: - # For now we do not obtain any of the benefits of iterators - # but since the number of layers = number of scales is typically - # small, this is not a concern. Loading and processing the entire - # dictionary at once allows us to do sanity checks all at once and - # avoid a situation where we have to abort after having partially - # loaded scaling factors + # Loading and processing the entire dictionary at once allows us + # to do sanity checks all at once and avoid a situation where we + # have to abort after having partially loaded scaling factors. + # Since the number of layers is small and (for now) we use scalar + # scaling factors (so the size they use is also small), this is + # not a concern at present.
raw_rank_map = json.load(f, parse_int=int, parse_constant=float) # If any of the inputs are malformed, it raises an error somewhere From f20eceb95d57c09a5eb060759b3fb974f51b490c Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Mon, 26 Feb 2024 23:40:31 -0500 Subject: [PATCH 107/159] Kernel and Device functions to enable FP8 KV cache scaling factors --- csrc/attention/attention_kernels.cu | 76 +++--- csrc/cache.h | 3 +- csrc/cache_kernels.cu | 17 +- csrc/ops.h | 6 +- .../fp8/amd_detail/quant_utils.cuh | 227 ++++++++++++++++++ vllm/model_executor/layers/attention.py | 6 + 6 files changed, 292 insertions(+), 43 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 173b21ed89fc8..aa176d4c4d74a 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -111,7 +111,8 @@ __device__ void paged_attention_kernel( const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, - const int kv_head_stride) { + const int kv_head_stride, + const float kv_scale) { const int seq_idx = blockIdx.y; const int partition_idx = blockIdx.z; const int max_num_partitions = gridDim.z; @@ -231,8 +232,8 @@ __device__ void paged_attention_kernel( k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant); #elif defined(ENABLE_FP8_E4M3) Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2); - // Vector conversion from Quant_vec to K_vec. - k_vecs[j] = fp8_e4m3::vec_conversion(k_vec_quant); + // Vector conversion from Quant_vec to K_vec. Scaled conversion: FP8 => higher precision + k_vecs[j] = fp8_e4m3::scaled_vec_conversion(k_vec_quant, kv_scale); #else assert(false); #endif @@ -355,9 +356,8 @@ __device__ void paged_attention_kernel( v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec); #elif defined(ENABLE_FP8_E4M3) V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset); - // Vector conversion from V_quant_vec to V_vec. 
- v_vec = fp8_e4m3::vec_conversion(v_quant_vec); ->>>>>>> Add e4m3 to attention kernels + // Vector conversion from V_quant_vec to V_vec. Scaled conversion: FP8 => higher precision + v_vec = fp8_e4m3::scaled_vec_conversion(v_quant_vec, kv_scale); #else assert(false); #endif @@ -462,11 +462,12 @@ __global__ void paged_attention_v1_kernel( const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, - const int kv_head_stride) { + const int kv_head_stride, + const float kv_scale) { paged_attention_kernel( /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, - max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride); + max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale); } // Grid: (num_heads, num_seqs, max_num_partitions). @@ -493,11 +494,12 @@ __global__ void paged_attention_v2_kernel( const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, - const int kv_head_stride) { + const int kv_head_stride, + const float kv_scale) { paged_attention_kernel( exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes, - q_stride, kv_block_stride, kv_head_stride); + q_stride, kv_block_stride, kv_head_stride, kv_scale); } // Grid: (num_heads, num_seqs). 
@@ -604,9 +606,9 @@ __global__ void paged_attention_v2_reduce_kernel( #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \ + IS_FP8_KV_CACHE>), shared_mem_size); \ vllm::paged_attention_v1_kernel<<>>( \ + IS_FP8_KV_CACHE><<>>( \ out_ptr, \ query_ptr, \ key_cache_ptr, \ @@ -619,7 +621,8 @@ __global__ void paged_attention_v2_reduce_kernel( alibi_slopes_ptr, \ q_stride, \ kv_block_stride, \ - kv_head_stride); + kv_head_stride, \ + kv_scale); // TODO(woosuk): Tune NUM_THREADS. template< @@ -638,7 +641,8 @@ void paged_attention_v1_launcher( torch::Tensor& block_tables, torch::Tensor& context_lens, int max_context_len, - const c10::optional& alibi_slopes) { + const c10::optional& alibi_slopes, + float kv_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -702,8 +706,8 @@ void paged_attention_v1_launcher( } } -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - paged_attention_v1_launcher( \ +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v1_launcher( \ out, \ query, \ key_cache, \ @@ -713,20 +717,21 @@ void paged_attention_v1_launcher( block_tables, \ context_lens, \ max_context_len, \ - alibi_slopes); + alibi_slopes, \ + kv_scale); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. 
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ + CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -745,7 +750,8 @@ void paged_attention_v1( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype) { + const std::string& kv_cache_dtype, + float kv_scale) { if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Float) { CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false); @@ -773,7 +779,7 @@ void paged_attention_v1( #define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ vllm::paged_attention_v2_kernel \ + IS_FP8_KV_CACHE, PARTITION_SIZE> \ <<>>( \ exp_sums_ptr, \ max_logits_ptr, \ @@ -789,7 +795,8 @@ void paged_attention_v1( alibi_slopes_ptr, \ q_stride, \ kv_block_stride, \ - kv_head_stride); \ + kv_head_stride, \ + kv_scale); \ vllm::paged_attention_v2_reduce_kernel \ <<>>( \ out_ptr, \ @@ -819,7 +826,8 @@ void paged_attention_v2_launcher( torch::Tensor& block_tables, torch::Tensor& context_lens, int max_context_len, - const c10::optional& alibi_slopes) { + const c10::optional& alibi_slopes, + float kv_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -889,8 +897,8 @@ void paged_attention_v2_launcher( } } -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - paged_attention_v2_launcher( \ +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + paged_attention_v2_launcher( \ out, \ exp_sums, \ max_logits, \ @@ 
-903,20 +911,21 @@ void paged_attention_v2_launcher( block_tables, \ context_lens, \ max_context_len, \ - alibi_slopes); + alibi_slopes, \ + kv_scale); // NOTE(woosuk): To reduce the compilation time, we omitted block sizes // 1, 2, 4, 64, 128, 256. -#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \ switch (block_size) { \ case 8: \ - CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \ break; \ case 16: \ - CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \ break; \ case 32: \ - CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ + CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \ break; \ default: \ TORCH_CHECK(false, "Unsupported block size: ", block_size); \ @@ -938,7 +947,8 @@ void paged_attention_v2( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype) { + const std::string& kv_cache_dtype, + float kv_scale) { if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Float) { CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false); diff --git a/csrc/cache.h b/csrc/cache.h index 7b9baa2ea97f5..718a5f6cfd7f7 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -21,7 +21,8 @@ void reshape_and_cache( torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + const float kv_scale); // Just for unittest void convert_fp8( diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 3e231b414b377..24aaa2ff3e263 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -165,7 +165,8 @@ __global__ void reshape_and_cache_kernel( const int num_heads, const int head_size, const int block_size, - const int x) { + const int x, + const float kv_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = 
slot_mapping[token_idx]; if (slot_idx < 0) { @@ -202,8 +203,8 @@ __global__ void reshape_and_cache_kernel( key_cache[tgt_key_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_key); value_cache[tgt_value_idx] = fp8_e5m2_unscaled::vec_conversion(tgt_value); #elif defined(ENABLE_FP8_E4M3) - key_cache[tgt_key_idx] = fp8_e4m3::vec_conversion(tgt_key); - value_cache[tgt_value_idx] = fp8_e4m3::vec_conversion(tgt_value); + key_cache[tgt_key_idx] = fp8_e4m3::scaled_vec_conversion(tgt_key, kv_scale); + value_cache[tgt_value_idx] = fp8_e4m3::scaled_vec_conversion(tgt_value, kv_scale); #else assert(false); #endif @@ -216,8 +217,8 @@ __global__ void reshape_and_cache_kernel( } // namespace vllm -#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ - vllm::reshape_and_cache_kernel<<>>( \ +#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, IS_FP8_KV_CACHE) \ + vllm::reshape_and_cache_kernel<<>>( \ reinterpret_cast(key.data_ptr()), \ reinterpret_cast(value.data_ptr()), \ reinterpret_cast(key_cache.data_ptr()), \ @@ -228,7 +229,8 @@ __global__ void reshape_and_cache_kernel( num_heads, \ head_size, \ block_size, \ - x); + x, \ + kv_scale); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -236,7 +238,8 @@ void reshape_and_cache( torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype) + const std::string& kv_cache_dtype, + const float kv_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); diff --git a/csrc/ops.h b/csrc/ops.h index 249c7451bf73c..831a5f003f2ed 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -14,7 +14,8 @@ void paged_attention_v1( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + float kv_scale); void paged_attention_v2( torch::Tensor& out, 
@@ -31,7 +32,8 @@ void paged_attention_v2( int block_size, int max_context_len, const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype); + const std::string& kv_cache_dtype, + float kv_scale); void rms_norm( torch::Tensor& out, diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh index 7bc70d9264ab8..d49b67425c95a 100644 --- a/csrc/quantization/fp8/amd_detail/quant_utils.cuh +++ b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -17,6 +17,12 @@ __inline__ __device__ Tout vec_conversion(const Tin& x) return x; } +template +__inline__ __device__ Tout scaled_vec_conversion(const Tin& x, const float scale) +{ + return x; +} + // fp8 -> half template <> __inline__ __device__ uint16_t vec_conversion(const uint8_t& a) @@ -215,6 +221,7 @@ __inline__ __device__ float4 vec_conversion(const uint32_t& a) return res; } +// float2 -> half2 template <> __inline__ __device__ uint32_t vec_conversion(const float2& a) { @@ -227,6 +234,7 @@ __inline__ __device__ uint32_t vec_conversion(const float2& a) return uint32; } +// Float4 -> half2x2 template <> __inline__ __device__ uint2 vec_conversion(const Float4_& a) { @@ -242,6 +250,7 @@ __inline__ __device__ uint2 vec_conversion(const Float4_& a) return b; } +// Float4 -> float4 template <> __inline__ __device__ float4 vec_conversion(const Float4_& a) { @@ -253,6 +262,7 @@ __inline__ __device__ float4 vec_conversion(const Float4_& a) return b; } +// Float8 -> half2x4 template <> __inline__ __device__ uint4 vec_conversion(const Float8_& a) { @@ -264,6 +274,7 @@ __inline__ __device__ uint4 vec_conversion(const Float8_& a) return b; } +// float2 -> bfloat162 template <> __inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(const float2& a) { @@ -271,6 +282,7 @@ __inline__ __device__ __nv_bfloat162 vec_conversion<__nv_bfloat162, float2>(cons return b; } +// Float4 -> bfloat162x2 template <> __inline__ __device__ bf16_4_t vec_conversion(const 
Float4_& a) { @@ -280,6 +292,7 @@ __inline__ __device__ bf16_4_t vec_conversion(const Float4_& return b; } +// Float8 -> bfloat162x4 template <> __inline__ __device__ bf16_8_t vec_conversion(const Float8_& a) { @@ -290,5 +303,219 @@ __inline__ __device__ bf16_8_t vec_conversion(const Float8_& b.w = __float22bfloat162_rn(a.w); return b; } + + +/* Scaled and vectorized conversions, for data exchange between high and low precision domains + + Convention of the scale in API, e.g: FP8_data = Quantization( High_Precision_data / scale ) + s.t. + Quantize(HP / scale) => FP8 + Dequant(FP8) * scale => HP + + */ + +// fp8 -> half +template <> +__inline__ __device__ uint16_t scaled_vec_conversion(const uint8_t& a, const float scale) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + __half_raw res; + res.data = static_cast(f8) * scale; + return res.x; +} + +// fp8x2 -> half2 +template <> +__inline__ __device__ uint32_t scaled_vec_conversion(const uint16_t& a, const float scale) +{ +#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + union { + __half2_raw h2r; + uint32_t ui32; + } tmp; + tmp.h2r.x.data = f2[0] * scale; + tmp.h2r.y.data = f2[1] * scale; + return tmp.ui32; +#else + union { + uint16_t u16[2]; + uint32_t u32; + } tmp; + + tmp.u16[0] = scaled_vec_conversion(static_cast(a), scale); + tmp.u16[1] = scaled_vec_conversion(static_cast(a >> 8U), scale); + return tmp.u32; +#endif +} + +// fp8x4 -> half2x2 +template <> +__inline__ __device__ uint2 scaled_vec_conversion(const uint32_t& a, const float scale) +{ + union { + uint2 u32x2; + uint32_t u32[2]; + } tmp; + tmp.u32[0] = scaled_vec_conversion((uint16_t)a, scale); + tmp.u32[1] = scaled_vec_conversion((uint16_t)(a >> 16U), scale); + return tmp.u32x2; +} + +// fp8x8 -> half2x4 +template <> +__inline__ __device__ uint4 scaled_vec_conversion(const uint2& a, const float scale) +{ + union { + uint4 u64x2; + uint2 u64[2]; + } tmp; + tmp.u64[0] = 
scaled_vec_conversion(a.x, scale); + tmp.u64[1] = scaled_vec_conversion(a.y, scale); + return tmp.u64x2; +} + +using __nv_bfloat16 = __hip_bfloat16; + +// fp8 -> __nv_bfloat16 +template <> +__inline__ __device__ __nv_bfloat16 scaled_vec_conversion<__nv_bfloat16, uint8_t>(const uint8_t& a, const float scale) +{ + hip_fp8 f8{a, hip_fp8::from_bits()}; + float f{f8}; + return __float2bfloat16(f * scale); +} + +using __nv_bfloat162 = __hip_bfloat162; + +// fp8x2 -> __nv_bfloat162 +template <> +__inline__ __device__ __nv_bfloat162 scaled_vec_conversion<__nv_bfloat162, uint16_t>(const uint16_t& a, const float scale) +{ + __nv_bfloat162 res; + res.x = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)a, scale); + res.y = scaled_vec_conversion<__nv_bfloat16, uint8_t>((uint8_t)(a >> 8U), scale); + return res; +} + +// fp8x4 -> bf16_4_t +template <> +__inline__ __device__ bf16_4_t scaled_vec_conversion(const uint32_t& a, const float scale) +{ + bf16_4_t res; + res.x = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)a, scale); + res.y = scaled_vec_conversion<__nv_bfloat162, uint16_t>((uint16_t)(a >> 16U), scale); + return res; +} + +// fp8x8 -> bf16_8_t +template <> +__inline__ __device__ bf16_8_t scaled_vec_conversion(const uint2& a, const float scale) +{ + bf16_4_t tmp1, tmp2; + tmp1 = scaled_vec_conversion(a.x, scale); + tmp2 = scaled_vec_conversion(a.y, scale); + bf16_8_t res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + +// fp8 -> float +template <> +__inline__ __device__ float scaled_vec_conversion(const uint8_t& a, const float scale) +{ + hip_fp8 fp8{a, hip_fp8::from_bits()}; + return static_cast(fp8) * scale; +} + +// fp8x2 -> float2 +template <> +__inline__ __device__ float2 scaled_vec_conversion(const uint16_t& a, const float scale) +{ +#if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) + float2 res; + const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); + //res.x = 
vec_conversion(static_cast(a)); + //res.y = vec_conversion(static_cast(a >> 8U)); + res.x = f2[0] * scale; + res.y = f2[1] * scale; + return res; +#else + float2 res; + res.x = scaled_vec_conversion(static_cast(a), scale); + res.y = scaled_vec_conversion(static_cast(a >> 8U), scale); + return res; +#endif +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ Float4_ scaled_vec_conversion(const uint32_t& a, const float scale) +{ + Float4_ res; + res.x = scaled_vec_conversion((uint16_t)a, scale); + res.y = scaled_vec_conversion((uint16_t)(a >> 16U), scale); + return res; +} + +// fp8x8 -> float8 +template <> +__inline__ __device__ Float8_ scaled_vec_conversion(const uint2& a, const float scale) +{ + Float4_ tmp1, tmp2; + tmp1 = scaled_vec_conversion(a.x, scale); + tmp2 = scaled_vec_conversion(a.y, scale); + Float8_ res; + res.x = tmp1.x; + res.y = tmp1.y; + res.z = tmp2.x; + res.w = tmp2.y; + return res; +} + + +/* Quantize(HP / scale) => FP8 */ + +// TODO(Hai): vectorized to add + +// half -> fp8 +template <> +__inline__ __device__ uint8_t scaled_vec_conversion(const uint16_t& a, const float scale) +{ + __half_raw tmp; + tmp.x = a; + + hip_fp8 f8{static_cast(tmp.data)/scale}; + return f8.data; +} + +// bf16 -> fp8 +template <> +__inline__ __device__ uint8_t scaled_vec_conversion(const __nv_bfloat16& a, const float scale) +{ + hip_fp8 res{__bfloat162float(a)/scale}; + return res.data; +} + +// float -> fp8 +template <> +__inline__ __device__ uint8_t scaled_vec_conversion(const float& a, const float scale) +{ + hip_fp8 f8(a/scale); + return f8.data; +} + +// fp8x4 -> float4 +template <> +__inline__ __device__ float4 scaled_vec_conversion(const uint32_t& a, const float scale) +{ + Float4_ tmp = scaled_vec_conversion(a, scale); + float4 res = make_float4(tmp.x.x, tmp.x.y, tmp.y.x, tmp.y.y); + return res; +} + } } // namespace vllm diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 2c775eed0e665..d6d5ee9ca9066 
100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -144,6 +144,7 @@ def forward( value_cache, input_metadata.slot_mapping.flatten(), input_metadata.kv_cache_dtype, + self.kv_cache_scaling_factor, ) if input_metadata.is_prompt: @@ -218,6 +219,7 @@ def forward( output = out.view_as(query) else: # prefix-enabled attention + # TODO(Hai) this triton kernel has regression issue with FP8 KVCache to handle mixed types output = torch.empty_like(query) context_attention_fwd( query, @@ -244,6 +246,7 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, + self.kv_cache_scaling_factor, ) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) @@ -291,6 +294,7 @@ def _paged_attention( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], + kv_scale: float, ) -> torch.Tensor: output = torch.empty_like(query) @@ -323,6 +327,7 @@ def _paged_attention( input_metadata.max_context_len, alibi_slopes, input_metadata.kv_cache_dtype, + kv_scale, ) else: # Run PagedAttention V2. 
@@ -354,5 +359,6 @@ def _paged_attention( input_metadata.max_context_len, alibi_slopes, input_metadata.kv_cache_dtype, + kv_scale, ) return output From 8834917423c032d9ef4e61d86f30bfb11ab0a578 Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Tue, 27 Feb 2024 13:25:24 -0500 Subject: [PATCH 108/159] Make KV cache scaling factors default to 1.0 instead of None --- vllm/model_executor/layers/attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index d6d5ee9ca9066..6e59fa2af3023 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -60,7 +60,7 @@ def __init__( # quantized_value * scaling_factor ~= true_value # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max - self.kv_cache_scaling_factor = None + self.kv_cache_scaling_factor = 1.0 assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads From 31875822d0b2a950b88d5bef7f8cdd298b0b5f32 Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Tue, 27 Feb 2024 20:59:37 -0500 Subject: [PATCH 109/159] Fix test cases from the introduction of KV cache scaling factors, using default kv_scale=1.0 --- benchmarks/kernels/benchmark_paged_attention.py | 5 +++++ tests/kernels/test_attention.py | 5 +++++ tests/kernels/test_cache.py | 5 ++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 575ca95e772c4..c1d6dacc6a0d4 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -97,6 +97,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: torch.cuda.cudart().cudaProfilerStart() start_time = time.perf_counter() + # Using default kv_scale + kv_scale = 1.0 + for _ in range(num_iters): if version == "v1": 
ops.paged_attention_v1( @@ -112,6 +115,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) elif version == "v2": ops.paged_attention_v2( @@ -130,6 +134,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) else: raise ValueError(f"Invalid version: {version}") diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 3b9079a74e621..d19fc4720d376 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -173,6 +173,9 @@ def test_paged_attention( device) key_cache, value_cache = key_caches[0], value_caches[0] + # Using default kv_scale + kv_scale = 1.0 + # Call the paged attention kernel. output = torch.empty_like(query) if version == "v1": @@ -189,6 +192,7 @@ def test_paged_attention( max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) elif version == "v2": num_partitions = ((max_context_len + PARTITION_SIZE - 1) // @@ -220,6 +224,7 @@ def test_paged_attention( max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale, ) else: raise AssertionError(f"Unknown version: {version}") diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f69b44cc8706e..7f87a8e31af1b 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -147,9 +147,12 @@ def test_reshape_and_cache( cloned_key_cache = key_cache.clone() cloned_value_cache = value_cache.clone() + # Using default kv_scale + kv_scale = 1.0 + # Call the reshape_and_cache kernel. 
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype) + slot_mapping, kv_cache_dtype, kv_scale) if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) From 49df502edecc96e0e73c871f25d444f0c72861cb Mon Sep 17 00:00:00 2001 From: HaiShaw Date: Thu, 29 Feb 2024 14:22:57 -0500 Subject: [PATCH 110/159] Cleanup comments according to reviews --- csrc/quantization/fp8/amd_detail/quant_utils.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/csrc/quantization/fp8/amd_detail/quant_utils.cuh b/csrc/quantization/fp8/amd_detail/quant_utils.cuh index d49b67425c95a..5f597407e3b02 100644 --- a/csrc/quantization/fp8/amd_detail/quant_utils.cuh +++ b/csrc/quantization/fp8/amd_detail/quant_utils.cuh @@ -438,8 +438,6 @@ __inline__ __device__ float2 scaled_vec_conversion(const uint1 #if defined(__HIP__MI300__) && defined(__HIP_FP8_EXPERIMENTAL_BULK_CONVERT__) float2 res; const auto& f2 = __builtin_amdgcn_cvt_pk_f32_fp8(a, 0); - //res.x = vec_conversion(static_cast(a)); - //res.y = vec_conversion(static_cast(a >> 8U)); res.x = f2[0] * scale; res.y = f2[1] * scale; return res; From 12f76508ffec611887736d83bc904b1c61039125 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Mon, 4 Mar 2024 19:55:49 +0000 Subject: [PATCH 111/159] Remove load_dummy_kv_cache_scales as convention change in PR#9 renders it superfluous --- vllm/model_executor/models/llama.py | 10 ---------- vllm/worker/model_runner.py | 6 +----- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 3c10f3600dbca..808e5b53aa110 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -366,10 +366,6 @@ def load_weights(self, # factors (or else raise an exception). 
Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state def load_kv_cache_scales(self, filename: str) -> None: - # Initialize KV cache scales to dummy values first. These will be - # overwritten by the actual values if and only if the later loading - # process completes without error - self.load_dummy_kv_cache_scales() tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_loader( @@ -383,9 +379,3 @@ def load_kv_cache_scales(self, filename: str) -> None: # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 layer_paged_attn.kv_cache_scaling_factor = scaling_factor - - # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) - def load_dummy_kv_cache_scales(self) -> None: - for layer_idx in range(self.model.config.num_hidden_layers): - layer_paged_attn = self.model.layers[layer_idx].self_attn.attn - setattr(layer_paged_attn, "kv_cache_scaling_factor", 1.0) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 361ed163dba3c..772106dc3ee6a 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -92,13 +92,9 @@ def load_model(self) -> None: raise RuntimeError("Using FP8 KV cache and scaling factors provided but " f"model {self.model.__class__} does not support loading " "scaling factors.") - elif callable(getattr(self.model, "load_dummy_kv_cache_scales", None)): + else: logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " "scaling factors of 1.0, This may lead to less accurate results!") - self.model.load_dummy_kv_cache_scales() - else: - raise RuntimeError(f"Using FP8 KV cache but no scaling factors provided and model " - "does not support loading dummy KV cache scaling factors.") elif self.model_config.kv_cache_scales_path is not None: logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. 
" "KV cache scaling factors will not be used.") From 4a8d06ce2c9bb5bba99e13923443379ba27c3720 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Tue, 5 Mar 2024 21:47:50 +0000 Subject: [PATCH 112/159] Add back removal of gather cached kv kernel for use with FP8 --- csrc/cache.h | 7 ++ csrc/cache_kernels.cu | 161 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 14 ++++ 3 files changed, 182 insertions(+) diff --git a/csrc/cache.h b/csrc/cache.h index 718a5f6cfd7f7..82b90eb4ab631 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -24,6 +24,13 @@ void reshape_and_cache( const std::string& kv_cache_dtype, const float kv_scale); +void gather_cached_kv( + torch::Tensor& key, + torch::Tensor& value, + torch::Tensor& key_cache, + torch::Tensor& value_cache, + torch::Tensor& slot_mapping); + // Just for unittest void convert_fp8( torch::Tensor& src_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 24aaa2ff3e263..73f61e92b1a51 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -277,6 +277,167 @@ void reshape_and_cache( namespace vllm { +// Grid: (num_blocks, block_size). 
+template +__global__ void gather_cached_kv_kernel( + scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size] + scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, head_size] + const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] + const int* __restrict__ slot_mapping, // [num_tokens] + const int key_stride, + const int value_stride, + const int num_heads, + const int head_size, + const int block_size, + const int x) { + const int token_idx = blockIdx.x; + const int slot_idx = slot_mapping[token_idx]; + const int block_idx = slot_idx / block_size; + const int block_offset = slot_idx % block_size; + + const int num_tokens = num_heads * head_size; + for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) { + const int tgt_key_idx = token_idx * key_stride + i; + const int tgt_value_idx = token_idx * value_stride + i; + + const int head_idx = i / head_size; + const int head_offset = i % head_size; + const int x_idx = head_offset / x; // the offset of the [head_size/x] dimension + const int x_offset = head_offset % x; + + const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x + + head_idx * (head_size / x) * block_size * x + + x_idx * block_size * x + + block_offset * x + + x_offset; + const int src_value_idx = block_idx * num_heads * head_size * block_size + + head_idx * head_size * block_size + + head_offset * block_size + + block_offset; + + key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]); + value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]); + } +} + +template +__global__ void gather_cached_kv_kernel_optimized( + scalar_t *__restrict__ key, // [num_tokens, [stride], num_heads, head_size] + scalar_t *__restrict__ value, // [num_tokens, [stride], num_heads, head_size] + const scalar_t *__restrict__ key_cache, // [num_blocks, num_heads, head_size/x, 
block_size, x] + const scalar_t *__restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] + const int *__restrict__ slot_mapping, // [num_tokens] + const int key_stride, + const int value_stride, + const int num_heads, + const int head_size, + const int block_size, + const int x) +{ + const int token_idx = blockIdx.x; + const int slot_idx = slot_mapping[token_idx]; + const int block_idx = slot_idx / block_size; + const int block_offset = slot_idx % block_size; + + const int dim = num_heads * head_size; + assert(dim % 4 == 0); // this is true for known use cases + const int unroll_factor = 4; + const int unrolled_dim = dim / unroll_factor; + + for (int i = threadIdx.x; i < unrolled_dim; i += blockDim.x) + { + int tgt_key_indices[unroll_factor]; + int tgt_value_indices[unroll_factor]; + int src_key_indices[unroll_factor]; + int src_value_indices[unroll_factor]; + scalar_t keys_to_store[unroll_factor]; + scalar_t values_to_store[unroll_factor]; + + #pragma unroll + for (int j = 0; j < unroll_factor; ++j) + { + int index = i + j * unrolled_dim; + + const int tgt_key_idx = token_idx * key_stride + index; + const int tgt_value_idx = token_idx * value_stride + index; + + const int head_idx = index / head_size; + const int head_offset = index % head_size; + const int x_idx = head_offset / x; + const int x_offset = head_offset % x; + + const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x + + head_idx * (head_size / x) * block_size * x + + x_idx * block_size * x + + block_offset * x + + x_offset; + const int src_value_idx = block_idx * num_heads * head_size * block_size + + head_idx * head_size * block_size + + head_offset * block_size + + block_offset; + + tgt_key_indices[j] = tgt_key_idx; + tgt_value_indices[j] = tgt_value_idx; + src_key_indices[j] = src_key_idx; + src_value_indices[j] = src_value_idx; + + keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]); + values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]); + } + 
+ #pragma unroll + for (int j = 0; j < unroll_factor; ++j) + { + key[tgt_key_indices[j]] = keys_to_store[j]; + value[tgt_value_indices[j]] = values_to_store[j]; + } + } +} + +} // namespace vllm + +void gather_cached_kv( + torch::Tensor& key, // [out] [num_tokens, num_heads, head_size] + torch::Tensor& value, // [out] [num_tokens, num_heads, head_size] + torch::Tensor& key_cache, // [in] [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& value_cache, // [in] [num_blocks, num_heads, head_size, block_size] + torch::Tensor& slot_mapping) // [in] [num_tokens] +{ + int num_tokens = key.size(0); + int num_heads = key.size(1); + int head_size = key.size(2); + int block_size = key_cache.size(3); + int x = key_cache.size(4); + + int key_stride = key.stride(0); + int value_stride = value.stride(0); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * head_size, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( + key.scalar_type(), + "gather_cached_kv_kernel_optimized", + [&] { + vllm::gather_cached_kv_kernel_optimized<<>>( + key.data_ptr(), + value.data_ptr(), + key_cache.data_ptr(), + value_cache.data_ptr(), + slot_mapping.data_ptr(), + key_stride, + value_stride, + num_heads, + head_size, + block_size, + x); + }); +} + +namespace vllm { + template __global__ void convert_fp8_kernel( const Tin* __restrict__ src_cache, diff --git a/setup.py b/setup.py index 78e93cae920ef..c33a4d0256a2c 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,20 @@ def _is_cuda() -> bool: CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] +def get_amdgpu_offload_arch(): + command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" + try: + output = subprocess.check_output([command]) + return output.decode('utf-8').strip() + except subprocess.CalledProcessError as e: + error_message = f"Error: {e}" + raise 
RuntimeError(error_message) from e + except FileNotFoundError as e: + # If the command is not found, print an error message + error_message = f"The command {command} was not found." + raise RuntimeError(error_message) from e + + return None def get_hipcc_rocm_version(): # Run the hipcc --version command From b87aec132484a3ae1fb76e6c2fbe91fc7934cf7e Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Tue, 5 Mar 2024 22:27:58 +0000 Subject: [PATCH 113/159] Clean up IFU --- vllm/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 46151b7e7f742..8b17562fea258 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -273,11 +273,7 @@ def create_kv_caches_with_random( raise ValueError(f"Invalid model dtype: {model_dtype}") elif cache_dtype in ["half", "bfloat16", "float"]: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype] -<<<<<<< HEAD - elif cache_dtype in ["fp8_e5m2", "fp8_e4m3"]: -======= elif cache_dtype == "fp8": ->>>>>>> Rename remaining fp8_e5m2 to general fp8 torch_dtype = torch.uint8 else: raise ValueError(f"Invalid kv cache dtype: {cache_dtype}") From fa6fbce91ed358cc9ccfd2c86dcc79b0ea96a245 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Tue, 5 Mar 2024 23:21:59 +0000 Subject: [PATCH 114/159] Clean up IFU --- vllm/utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 8b17562fea258..5de2990e9dd27 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -28,12 +28,7 @@ "half": torch.half, "bfloat16": torch.bfloat16, "float": torch.float, -<<<<<<< HEAD - "fp8_e5m2": torch.uint8, - "fp8_e4m3": torch.uint8, -======= "fp8": torch.uint8, ->>>>>>> Rename remaining fp8_e5m2 to general fp8 } From 65f70d722782fc2840a071d1c8877e4027a75f18 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Wed, 6 Mar 2024 00:18:24 +0000 Subject: [PATCH 115/159] Schema change: preliminary changes to extract script, TODO: loading logic refactor --- 3rdparty/quantizer/extract_scales.py | 125 
+++++++++++++++++++++------ vllm/model_executor/weight_utils.py | 2 +- 2 files changed, 99 insertions(+), 28 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index ee5e7d3068610..eb8d32507d191 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -7,7 +7,7 @@ import os from safetensors.torch import safe_open import torch -from typing import List, Optional, Tuple +from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple # Adapted from vllm/model_executor/weight_utils.py @@ -90,12 +90,11 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, torch.cuda.empty_cache() -def main(args): - rank_tensors_map = {} - hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) - # Matches the number immediately after this keyword in the tensor filename to - # determine the TP rank corresponding to said tensor file - rank_keyword = "rank" +def _kv_scales_extractor(hf_tensor_files: Iterator[str], + use_safetensors: bool, + rank_keyword: str = "rank", + expected_tp_size: Optional[int] = None,) -> Dict[int, Dict[int, float]]: + rank_scales_map = {} for tensor_file in hf_tensor_files: try: rank_idx = tensor_file.find(rank_keyword) @@ -118,9 +117,9 @@ def main(args): f"corresponding to file '{tensor_file}'") raise - if rank not in rank_tensors_map: + if rank not in rank_scales_map: layer_scales_map = {} - rank_tensors_map[rank] = layer_scales_map + rank_scales_map[rank] = layer_scales_map else: raise RuntimeError(f"Tensor file '{tensor_file}' shares TP rank {rank} " "with another tensor file.") @@ -138,10 +137,93 @@ def main(args): layer_scales_map[layer_idx] = param.item() except RuntimeError: print("This utility supports only per-tensor scalar scale factors " - f"for now. The tensor\n {name} = {param} is an invalid " + f"for now. 
The tensor\n {name} = {param} \nis an invalid " "scale factor.") raise + if all(len(layer_scales_map) == 0 for layer_scales_map in rank_scales_map.values()): + # Note: this is true even if the rank_scales_map is empty + print("WARNING: No KV cache scale factors found. No output saved.") + return None + empirical_tp_world_size = max(rank_scales_map.keys()) + 1 + if expected_tp_size is not None: + assert expected_tp_size == empirical_tp_world_size, "User expected TP world size = " \ + f"{expected_tp_size} from model but tool is expecting TP world size = " \ + f"{empirical_tp_world_size} from model instead." + for i in range(empirical_tp_world_size): + assert i in rank_scales_map, f"Expected TP world size = {empirical_tp_world_size} " \ + "but did not find KV cache scaling factors " \ + f"for TP rank {i}" + print(f"Found TP world size = {empirical_tp_world_size} when extracting KV cache scales!") + return rank_scales_map + + +def _metadata_extractor(quantized_model_dir: str, + metadata_from_schema: Dict[str, Callable[[Dict[str, Any]], Any]]) -> Dict[str, Any]: + if not os.path.isdir(quantized_model_dir): + raise FileNotFoundError(f"The quantized model directory `{quantized_model_dir}` " + "does not exist.") + allow_patterns = [ "*.json" ] + + metadata_files: List[str] = [] + for pattern in allow_patterns: + metadata_files += glob.glob(os.path.join(quantized_model_dir, pattern)) + + result = {} + for file in metadata_files: + with open(file) as f: + try: + schema = json.load(f) + for metadata, from_schema_fn in metadata_from_schema.items(): + if metadata not in result: + result[metadata] = from_schema_fn(schema) + + except json.JSONDecodeError: + pass + except ValueError: + pass + return result + + +def main(args): + metadata_from_schema = { + "model_type": lambda schema: schema["layers"][0]["decoder_type"], + "tp_size": lambda schema: int(schema["tensor_parallel"]), + "model_dtype": lambda schema: schema["dtype"] + } + metadata_dict = 
_metadata_extractor(args.quantized_model, metadata_from_schema) + model_dtype = metadata_dict["model_dtype"] + + hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) + # Matches the number immediately after this keyword in the tensor filename to + # determine the TP rank corresponding to said tensor file + rank_keyword = "rank" + rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, + rank_keyword, args.tp_size) + # Postprocess: formatting to the current schema. Consider pulling it out into a dedicated + # function should it ever become more complicated. + rank_scales_map = { rank_keyword + str(rank) : scale + for rank, scale in rank_scales_map.items() } + + # Consider unifying and formalizing this into its own class (and other necessary subclasses) in + # the future + schema = { "model_type": metadata_dict["model_type"], + "kv_cache": { + "dtype": "fp8" if rank_scales_map else model_dtype, + "scaling_factor": rank_scales_map + }, + # The fields below this comment are not used or checked for now + # but will be in the future + "activation": { + "dtype": model_dtype, + "scaling_factor": None, + }, + "weight": { + "dtype": model_dtype, + "scaling_factor": None + } + } + if args.output_dir is None: output_file = os.path.join(args.quantized_model, args.output_name) else: @@ -149,23 +231,12 @@ def main(args): if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) - if all(len(layer_scales_map) == 0 for layer_scales_map in rank_tensors_map.values()): - # Note: this is true even if the rank_tensors_map is empty - print("WARNING: No KV cache scale factors found. No output saved.") - else: - empirical_tp_world_size = max(rank_tensors_map.keys()) + 1 - if args.tp_size is not None: - assert args.tp_size == empirical_tp_world_size, "User expected TP world size = " \ - f"{args.tp_size} from model but tool is expecting TP world size = " \ - f"{empirical_tp_world_size} from model instead." 
- for i in range(empirical_tp_world_size): - assert i in rank_tensors_map, f"Expected TP world size = {empirical_tp_world_size} " \ - "but did not find KV cache scaling factors " \ - f"for TP rank {i}" - with open(output_file, 'w') as f: - json.dump(rank_tensors_map, f, sort_keys=True, indent=4) - print(f"Completed! Found TP world size = {empirical_tp_world_size}.", - f"KV cache scaling factors saved to {output_file}") + + with open(output_file, 'w') as f: + pass + #json.dump(rank_scales_map, f, sort_keys=True, indent=4) + #print(f"Completed! Found TP world size = {empirical_tp_world_size}.", + # f"KV cache scaling factors saved to {output_file}") if __name__ == "__main__": diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 44515916db685..8c5624b765b06 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -316,7 +316,7 @@ def kv_cache_scales_iterator(filename: str, logger.error(f"An error occurred while reading '{filename}': {e}") # This section is only reached if any of the excepts are hit # Return an empty iterator (tuple) => no KV cache scales are loaded - # which effectively defaults to 1.0 scales + # which defaults to 1.0 scales logger.warn(f"Defaulting to KV cache scaling factors = 1.0 for all layers in TP rank {tp_rank}" " as an error occurred during loading.") return () From f5c0236f242f3d70f52df9432bf5fc6cff22c5fb Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Wed, 6 Mar 2024 22:37:39 +0000 Subject: [PATCH 116/159] Fix runtime issues with upstream rebase --- benchmarks/benchmark_throughput.py | 20 +++++++++++--------- vllm/config.py | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/llm_engine.py | 3 +-- vllm/worker/cache_engine.py | 1 - 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index bc095bbf1300c..b6578b99cbaef 100644 --- a/benchmarks/benchmark_throughput.py +++ 
b/benchmarks/benchmark_throughput.py @@ -72,8 +72,8 @@ def run_vllm( max_model_len: Optional[int], enforce_eager: bool, kv_cache_dtype: str, - device: str, kv_cache_scales_path: Optional[str], + device: str, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -87,8 +87,8 @@ def run_vllm( max_model_len=max_model_len, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, - device=device, kv_cache_scales_path=kv_cache_scales_path, + device=device, ) # Add the requests to the engine. @@ -213,7 +213,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.kv_cache_scales_path, args.device) + args.kv_cache_dtype,args.kv_cache_scales_path,args.device) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -299,12 +299,6 @@ def main(args: argparse.Namespace): help='Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') - parser.add_argument( - "--device", - type=str, - default="cuda", - choices=["cuda"], - help='device type for vLLM execution, supporting CUDA only currently.') parser.add_argument( '--kv-cache-scales-path', type=str, @@ -314,7 +308,15 @@ def main(args: argparse.Namespace): 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + parser.add_argument( + "--device", + type=str, + default="cuda", + choices=["cuda"], + help='device type for vLLM execution, supporting CUDA only currently.') + args = parser.parse_args() + if args.tokenizer is None: args.tokenizer = args.model if args.dataset is None: diff --git a/vllm/config.py b/vllm/config.py index e9f0730941055..9d66dde356be2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -92,6 +92,7 @@ def __init__( self.download_dir = download_dir self.kv_cache_scales_path = kv_cache_scales_path self.load_format = load_format + self.dtype = dtype self.seed = seed self.revision = revision self.code_revision = code_revision @@ -99,7 +100,6 @@ def __init__( self.quantization = quantization self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture - if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true": # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 5a94fe3a876b4..c5ab7da545f6c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -297,7 +297,7 @@ def create_engine_configs( model_config = ModelConfig(self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, self.download_dir, self.kv_cache_scales_path, self.load_format, - self.dtype, self.seed, self.revision, + self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager, self.max_context_len_to_capture) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8d74088b9af07..8ab8f6221e77a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -114,10 +114,9 @@ def __init__( self.device_config = device_config self.log_stats = log_stats self._verify_args() - self._init_tokenizer() self.seq_counter = Counter() - + # Create the parallel GPU workers. if self.parallel_config.worker_use_ray: # Disable Ray usage stats collection. 
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 2ceab308a9e21..3adf628394e7b 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -7,7 +7,6 @@ from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE -from vllm.model_executor.weight_utils import kv_cache_scales_iterator from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) From d2a42f9a86963d69f6067ec9467411ad001f0bab Mon Sep 17 00:00:00 2001 From: ttbachyinsda Date: Tue, 5 Mar 2024 03:17:12 +0800 Subject: [PATCH 117/159] [Minor fix] The domain dns.google may cause a socket.gaierror exception (#3176) Co-authored-by: guofangze --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 5de2990e9dd27..07d29bd8fe303 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -173,7 +173,7 @@ def get_ip() -> str: # try ipv4 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) try: - s.connect(("dns.google", 80)) # Doesn't need to be reachable + s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable return s.getsockname()[0] except OSError: # try ipv6 From 4b3e4b0f6157b3b885ed068754b389f91160882f Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 7 Mar 2024 04:57:24 +0000 Subject: [PATCH 118/159] Preliminary refactoring: KV cache scales JSON into general scales JSON schema --- 3rdparty/quantizer/extract_scales.py | 124 +++++++++++++++++++-------- benchmarks/benchmark_throughput.py | 8 +- vllm/config.py | 13 +-- vllm/engine/arg_utils.py | 8 +- vllm/model_executor/models/llama.py | 28 +++--- vllm/model_executor/weight_utils.py | 64 ++++++++------ vllm/worker/model_runner.py | 12 +-- 7 files changed, 158 insertions(+), 99 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index eb8d32507d191..cd93fb7b5abdf 100644 --- 
a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -91,9 +91,23 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, def _kv_scales_extractor(hf_tensor_files: Iterator[str], - use_safetensors: bool, - rank_keyword: str = "rank", - expected_tp_size: Optional[int] = None,) -> Dict[int, Dict[int, float]]: + use_safetensors: bool, + rank_keyword: str = "rank", + expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: + """ + Given a list of files containing tensor data, attempt to extract KV cache scales from + these files. Intended as a helper function taking in the output from _prepare_hf_weights. + Args: + rank_keyword Matches the number immediately after this keyword in the tensor + filename to determine the TP rank corresponding to said tensor file + expected_tp_size If specified, the TP size of the tensor files is checked against + this and an error is raised if they do not match. + Returns a dictionary mapping TP ranks to their relevant KV cache scaling factors. The + per-rank scaling factors are themselves represented as a dictionary of layer indices to the + respective per-layer scaling factor. + """ + for char in rank_keyword: + assert not char.isdecimal(), f"Rank keyword {rank_keyword} contains a numeric character!" rank_scales_map = {} for tensor_file in hf_tensor_files: try: @@ -159,57 +173,98 @@ def _kv_scales_extractor(hf_tensor_files: Iterator[str], def _metadata_extractor(quantized_model_dir: str, - metadata_from_schema: Dict[str, Callable[[Dict[str, Any]], Any]]) -> Dict[str, Any]: + metadata_extract_fns: Dict[str, Callable[[Dict[str, Any]], Any]]) \ + -> Dict[str, Any]: + """ + Given the quantized model directory, tries to extract metadata from the JSON files in this + directory. It is assumed that each JSON file corresponds to a JSON-serialization of some + dictionary (the "JSON-dictionary"). 
The metadata fields to be extracted and how to extract + it is specified in metadata_extract_fns, which is a dictionary mapping metadata field names + to extraction functions. + Extraction functions should take in a JSON-dictionary as the sole argument and return the + metadata corresponding to that function. The extraction function is allowed to raise + exceptions, but the special exceptions KeyError or ValueError must be raised if and only if + the metadata field cannot be extracted from the current JSON-dictionary yet it remains + possible that said metadata field can be found in another JSON-dictionary. + Returns a dictionary mapping metadata fields to their extracted data. The dictionary's keys + are exactly the same as those in `metadata_extract_fns`: if any fields could not be extracted, + their value is set to None and a warning is printed. + """ if not os.path.isdir(quantized_model_dir): raise FileNotFoundError(f"The quantized model directory `{quantized_model_dir}` " "does not exist.") - allow_patterns = [ "*.json" ] - - metadata_files: List[str] = [] - for pattern in allow_patterns: - metadata_files += glob.glob(os.path.join(quantized_model_dir, pattern)) + metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) result = {} for file in metadata_files: with open(file) as f: try: - schema = json.load(f) - for metadata, from_schema_fn in metadata_from_schema.items(): - if metadata not in result: - result[metadata] = from_schema_fn(schema) - + metadata = json.load(f) except json.JSONDecodeError: - pass - except ValueError: - pass + print(f"Could not parse `{file}` as a valid metadata file, skipping it.") + continue + if not isinstance(metadata, dict): + print(f"The file `{file}` does not correspond to a JSON-serialized " + "dictionary, skipping it.") + continue + for metadata_name, extract_fn in metadata_extract_fns.items(): + try: + metadata_info = extract_fn(metadata) + if metadata_name not in result: + result[metadata_name] = metadata_info 
+ elif metadata_info != result[metadata_name]: + raise RuntimeError("Metadata mismatch! Originally found " + f"{metadata_name} = {result[metadata_name]} but " + f"now found {metadata_name} = {metadata_info} in " + f"`{file}`") + except KeyError: + # It is possible that a given file does not contain some of our selected + # metadata as it could be located in some other metadata file. + # 'EFINAE': extract_fn failure is not an error. + pass + except ValueError: + # See above. + pass + + # Warn if we cannot find any of the requested metadata + for metadata_name in metadata_extract_fns: + if metadata_name not in result: + print(f"Unable to find requested metadata field `{metadata_name}`!") + result[metadata_name] = None + return result def main(args): - metadata_from_schema = { - "model_type": lambda schema: schema["layers"][0]["decoder_type"], - "tp_size": lambda schema: int(schema["tensor_parallel"]), - "model_dtype": lambda schema: schema["dtype"] + metadata_extract_fns = { + "model_type": lambda json_dict: json_dict["layers"][0]["decoder_type"], + "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]), + "model_dtype": lambda json_dict: json_dict["dtype"] } - metadata_dict = _metadata_extractor(args.quantized_model, metadata_from_schema) - model_dtype = metadata_dict["model_dtype"] + metadata_dict = _metadata_extractor(args.quantized_model, metadata_extract_fns) hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) - # Matches the number immediately after this keyword in the tensor filename to - # determine the TP rank corresponding to said tensor file + if args.tp_size is not None: + metadata_tp_size = metadata_dict["tp_size"] + if metadata_tp_size is not None: + assert args.tp_size == metadata_tp_size, "User expected TP world size = " \ + f"{args.tp_size} but found TP world size = {metadata_tp_size}!" 
+ expected_tp_size = args.tp_size or metadata_dict["tp_size"] rank_keyword = "rank" rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, - rank_keyword, args.tp_size) + rank_keyword, expected_tp_size) # Postprocess: formatting to the current schema. Consider pulling it out into a dedicated # function should it ever become more complicated. - rank_scales_map = { rank_keyword + str(rank) : scale + rank_scales_map = { rank_keyword + str(rank) : + { k: scale[k] for k in sorted(scale.keys())} for rank, scale in rank_scales_map.items() } + model_dtype = metadata_dict["model_dtype"] # Consider unifying and formalizing this into its own class (and other necessary subclasses) in # the future schema = { "model_type": metadata_dict["model_type"], "kv_cache": { - "dtype": "fp8" if rank_scales_map else model_dtype, + "dtype": "fp8" if len(rank_scales_map) > 0 else model_dtype, "scaling_factor": rank_scales_map }, # The fields below this comment are not used or checked for now @@ -227,16 +282,13 @@ def main(args): if args.output_dir is None: output_file = os.path.join(args.quantized_model, args.output_name) else: - output_file = os.path.join(args.output_dir, args.output_name) if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) - - + output_file = os.path.join(args.output_dir, args.output_name) + with open(output_file, 'w') as f: - pass - #json.dump(rank_scales_map, f, sort_keys=True, indent=4) - #print(f"Completed! Found TP world size = {empirical_tp_world_size}.", - # f"KV cache scaling factors saved to {output_file}") + json.dump(schema, f, indent=4) + print(f"Completed! KV cache scaling factors saved to {output_file}") if __name__ == "__main__": @@ -245,7 +297,7 @@ def main(args): "and saves them to a JSON file compatible with later " "use by vLLM (pass this file to the appropriate " "runtime typically using the argument " - "--kv_cache_scales_path ). This is only used " + "--scales_path ). 
This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--quantized_model", help="Specify the directory containing a single quantized HF model. " diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ae37e356cda9a..03e624fa91cf4 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -72,7 +72,7 @@ def run_vllm( max_model_len: Optional[int], enforce_eager: bool, kv_cache_dtype: str, - kv_cache_scales_path: Optional[str], + scales_path: Optional[str], ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -86,7 +86,7 @@ def run_vllm( max_model_len=max_model_len, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, - kv_cache_scales_path=kv_cache_scales_path, + scales_path=scales_path, ) # Add the requests to the engine. @@ -211,7 +211,7 @@ def main(args: argparse.Namespace): args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, - args.kv_cache_dtype, args.kv_cache_scales_path) + args.kv_cache_dtype, args.scales_path) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -298,7 +298,7 @@ def main(args: argparse.Namespace): 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( - '--kv-cache-scales-path', + '--scales-path', type=str, default=None, help='Path to the JSON files containing the KV cache scaling factors. ' diff --git a/vllm/config.py b/vllm/config.py index bb0308d9add16..d21add66105b9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,10 +27,11 @@ class ModelConfig: downloading the model and tokenizer. download_dir: Directory to download and load the weights, default to the default cache directory of huggingface. 
- kv_cache_scales_path: Path to files containing JSON serialization of a map - of layer indices to their respective KV cache scaling factors. Used to - load aforementioned scaling factors into the model when KV cache type - is FP8_E4M3 on ROCm (AMD GPU). + scales_path: Path to file containing scaling factors. Used to load + KV cache scaling factors into the model when KV cache type + is FP8_E4M3 on ROCm (AMD GPU). In the future these will also be used + to load activation and weight scaling factors when the model dtype is + FP8_E4M3 on ROCm. load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is @@ -70,7 +71,7 @@ def __init__( tokenizer_mode: str, trust_remote_code: bool, download_dir: Optional[str], - kv_cache_scales_path: Optional[str], + scales_path: Optional[str], load_format: str, dtype: Union[str, torch.dtype], seed: int, @@ -86,7 +87,7 @@ def __init__( self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code self.download_dir = download_dir - self.kv_cache_scales_path = kv_cache_scales_path + self.scales_path = scales_path self.load_format = load_format self.seed = seed self.revision = revision diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index df75405269662..64fb58f4ace7e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,7 @@ class EngineArgs: load_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' - kv_cache_scales_path: str = None + scales_path: str = None seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False @@ -133,10 +133,10 @@ def add_cli_args( 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( - '--kv-cache-scales-path', + '--scales-path', type=str, default=None, - help='Path to the JSON files containing the KV cache scaling factors. ' + help='Path to the JSON file containing scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' @@ -281,7 +281,7 @@ def create_engine_configs( Optional[LoRAConfig]]: model_config = ModelConfig(self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, - self.download_dir, self.kv_cache_scales_path, self.load_format, + self.download_dir, self.scales_path, self.load_format, self.dtype, self.seed, self.revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 12ef27bd7b45c..42c0b0f1a1cf6 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -45,9 +45,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator, - kv_cache_scales_iterator) + kv_cache_scales_loader) from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig +from vllm.utils import is_hip KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -364,21 +365,18 @@ def load_weights(self, # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, filename: str) -> None: - # Initialize KV cache scales to dummy values first. 
These will be - # overwritten by the actual values if and only if the later loading - # process completes without error - self.load_dummy_kv_cache_scales() + def load_kv_cache_scales(self, scales_path: str) -> None: tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_iterator( - filename, tp_rank, tp_size, - self.model.config.num_hidden_layers): + for layer_idx, scaling_factor in kv_cache_scales_loader( + scales_path, tp_rank, tp_size, + self.config.num_hidden_layers, + self.config.__class__.model_type): layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + if is_hip(): + # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of setting + # scaling_factor = tensor_amax / FPtype_max + scaling_factor *= 2 layer_paged_attn.kv_cache_scaling_factor = scaling_factor - - # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) - def load_dummy_kv_cache_scales(self) -> None: - for layer_idx in range(self.model.config.num_hidden_layers): - layer_paged_attn = self.model.layers[layer_idx].self_attn.attn - setattr(layer_paged_attn, "kv_cache_scaling_factor", 1.0) diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 8c5624b765b06..10d97359278dd 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -262,10 +262,12 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_iterator(filename: str, - tp_rank: int, - tp_size: int, - num_hidden_layers: int) -> Iterator[Tuple[int, torch.Tensor]]: +def kv_cache_scales_loader(filename: str, + tp_rank: int, + tp_size: int, + num_hidden_layers: int, + model_type: Optional[str], + rank_keyword="rank") -> Iterator[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. 
Used by the model to populate the appropriate @@ -277,32 +279,42 @@ def kv_cache_scales_iterator(filename: str, """ try: with open(filename) as f: - # For now we do not obtain any of the benefits of iterators - # but since the number of layers = number of scales is typically - # small, this is not a concern. Loading and processing the entire - # dictionary at once allows us to do sanity checks all at once and - # avoid a situation where we have to abort after having partially - # loaded scaling factors - raw_rank_map = json.load(f, parse_int=int, parse_constant=float) - - # If any of the inputs are malformed, it raises an error somewhere - # in the following lines and is caught in except - assert isinstance(raw_rank_map, dict), "Did not load a dictionary from file." - assert len(raw_rank_map) != 0, "Loaded dictionary is empty." - loaded_tp_size = max(int(rank) for rank in raw_rank_map) + 1 + # Loading and processing the entire dictionary at once allows us + # to do sanity checks all at once and avoid a situation where we + # have to abort after having partially loaded scaling factors + # Since the number of layers is small and (for now) we use scalar + # scaling factors (so the size they use is also small), this is + # not a concern at present. + schema = json.load(f, parse_int=int, parse_constant=float) + + malformed_schema_str = "Malformed schema detected." + # If any of the inputs are malformed or mismatched, it raises an error + # somewhere in the following lines and is caught in except + assert isinstance(schema, dict), malformed_schema_str + if schema["model_type"] is not None: + assert model_type == schema["model_type"], f"Model type is {model_type} but loaded " \ + f"scaling factors belonging to different model type {schema["model_type"]}!" 
+ assert isinstance(schema["kv_cache"], dict), malformed_schema_str + assert schema["kv_cache"]["dtype"] == "fp8", "Loaded scaling factors intended for KV " \ + f"cache dtype = {schema["kv_cache"]["dtype"]} rather than FP8!" + assert isinstance(schema["kv_cache"]["scaling_factor"], dict), malformed_schema_str + rank_scales_map = {int(rank.replace(rank_keyword, "")) : scales_map + for rank, scales_map in schema["kv_cache"]["scaling_factor"].items()} + assert len(rank_scales_map) != 0, "Loaded dictionary is empty." + loaded_tp_size = max(rank_scales_map.keys()) + 1 assert loaded_tp_size == tp_size, f"Loaded dictionary has TP size {loaded_tp_size} " \ f"but LLM engine is currently running with TP size {tp_size}." - for rank, scales_map in raw_rank_map.items(): + for rank, scales_map in rank_scales_map.items(): + assert isinstance(scales_map, dict), malformed_schema_str assert len(scales_map) == num_hidden_layers, "KV cache scales map for TP rank " \ f"{rank} is malformed. Expected {num_hidden_layers} layers, got {len(scales_map)}." for i in range(tp_size): - assert i in raw_rank_map or str(i) in raw_rank_map, "KV cache scales map for TP rank " \ - f"{i} not found." - assert tp_rank in raw_rank_map or str(tp_rank) in raw_rank_map, "Tried to load KV cache " \ - f"scales for TP rank {tp_rank} but these were not found." - raw_layer_scales_map = raw_rank_map.get(tp_rank) or raw_rank_map.get(str(tp_rank)) + assert i in rank_scales_map, f"KV cache scales map for TP rank {i} not found." + assert tp_rank in rank_scales_map, "Tried to load KV cache scales for TP rank " \ + f"{tp_rank} but these were not found." 
+ assert isinstance(rank_scales_map[tp_rank], dict), malformed_schema_str layer_scales_map = {int(layer_idx): float(scale) - for layer_idx, scale in raw_layer_scales_map.items()} + for layer_idx, scale in rank_scales_map[tp_rank].items()} for i in range(num_hidden_layers): assert i in layer_scales_map, "Could not find KV cache scales for layer " \ f"{i} in TP rank {tp_rank}." @@ -314,9 +326,9 @@ def kv_cache_scales_iterator(filename: str, logger.error(f"Error decoding JSON in file '{filename}'.") except Exception as e: logger.error(f"An error occurred while reading '{filename}': {e}") - # This section is only reached if any of the excepts are hit + # This section is reached if and only if any of the excepts are hit # Return an empty iterator (tuple) => no KV cache scales are loaded - # which defaults to 1.0 scales + # which effectively defaults to 1.0 scales logger.warn(f"Defaulting to KV cache scaling factors = 1.0 for all layers in TP rank {tp_rank}" " as an error occurred during loading.") return () diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 361ed163dba3c..53a4a8ad51a0f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -85,21 +85,17 @@ def load_model(self) -> None: self.model = self.lora_manager.create_lora_manager(self.model) if self.kv_cache_dtype == "fp8": - if self.model_config.kv_cache_scales_path is not None: + if self.model_config.scales_path is not None: if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales(self.model_config.kv_cache_scales_path) + self.model.load_kv_cache_scales(self.model_config.scales_path) else: raise RuntimeError("Using FP8 KV cache and scaling factors provided but " f"model {self.model.__class__} does not support loading " "scaling factors.") - elif callable(getattr(self.model, "load_dummy_kv_cache_scales", None)): + else: logger.warn(f"Using FP8 KV cache but no scaling factors provided. 
Defaulting to " "scaling factors of 1.0, This may lead to less accurate results!") - self.model.load_dummy_kv_cache_scales() - else: - raise RuntimeError(f"Using FP8 KV cache but no scaling factors provided and model " - "does not support loading dummy KV cache scaling factors.") - elif self.model_config.kv_cache_scales_path is not None: + elif self.model_config.scales_path is not None: logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. " "KV cache scaling factors will not be used.") From a7e6e810da7dcf5781c534183061bb8480a3fc18 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 7 Mar 2024 05:42:54 +0000 Subject: [PATCH 119/159] Fixing stray syntax errors and typos, refactoring rank_keyword detection in loader --- 3rdparty/quantizer/extract_scales.py | 6 +++--- benchmarks/benchmark_latency.py | 6 +++--- benchmarks/benchmark_throughput.py | 2 +- tests/fp8_kv/README.md | 12 +++++------- vllm/config.py | 2 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/llm_engine.py | 2 +- vllm/model_executor/weight_utils.py | 21 ++++++++++++++------- 8 files changed, 29 insertions(+), 24 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index cd93fb7b5abdf..7fba41905ce38 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -248,7 +248,7 @@ def main(args): metadata_tp_size = metadata_dict["tp_size"] if metadata_tp_size is not None: assert args.tp_size == metadata_tp_size, "User expected TP world size = " \ - f"{args.tp_size} but found TP world size = {metadata_tp_size}!" + f"{args.tp_size} but found TP world size = {metadata_tp_size} from metadata!" expected_tp_size = args.tp_size or metadata_dict["tp_size"] rank_keyword = "rank" rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, @@ -256,7 +256,7 @@ def main(args): # Postprocess: formatting to the current schema. 
Consider pulling it out into a dedicated # function should it ever become more complicated. rank_scales_map = { rank_keyword + str(rank) : - { k: scale[k] for k in sorted(scale.keys())} + {k: scale[k] for k in sorted(scale.keys())} for rank, scale in rank_scales_map.items() } model_dtype = metadata_dict["model_dtype"] @@ -297,7 +297,7 @@ def main(args): "and saves them to a JSON file compatible with later " "use by vLLM (pass this file to the appropriate " "runtime typically using the argument " - "--scales_path ). This is only used " + "--scales-path ). This is only used " "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") parser.add_argument("--quantized_model", help="Specify the directory containing a single quantized HF model. " diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index c365f08b021ad..47560bcbe18c9 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -25,7 +25,7 @@ def main(args: argparse.Namespace): dtype=args.dtype, enforce_eager=args.enforce_eager, kv_cache_dtype=args.kv_cache_dtype, - kv_cache_scales_path=args.kv_cache_scales_path, + scales_path=args.scales_path, ) sampling_params = SamplingParams( @@ -128,10 +128,10 @@ def run_to_completion(profile_dir: Optional[str] = None): 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') parser.add_argument( - '--kv-cache-scales-path', + '--scales-path', type=str, default=None, - help='Path to the JSON files containing the KV cache scaling factors. ' + help='Path to the JSON file containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 03e624fa91cf4..898abd4fbbbfe 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -301,7 +301,7 @@ def main(args: argparse.Namespace): '--scales-path', type=str, default=None, - help='Path to the JSON files containing the KV cache scaling factors. ' + help='Path to the JSON file containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index fde6e9d6187c3..81a12048f9d04 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -36,16 +36,14 @@ The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found [ # prerequisites: # - Quantized HF LLaMa 2 model python3 3rdparty/quantizer/extract_scales.py --help -Useage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--cache_dir CACHE_DIR] [--load_format {auto,safetensors,npz,pt}] [--revision REVISION] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] +Useage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] KV Scale Extraction Example optional arguments: --quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). Optional arguments: ---cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None) --load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. 
(Choices: auto, safetensors, npz, pt; Default: auto) ---revision: Specify the model's revision number. (Default: None) --output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) --output_name: Specify the output filename. (Default: kv_cache_scales.json) --tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None) @@ -55,7 +53,7 @@ Example: python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. -This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for kv cache scaling factors to be utilized for FP8. +This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. 
```python # prerequisites: # - LLaMa 2 kv_cache_scales.json file @@ -65,7 +63,7 @@ usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET [--tokenizer TOKENIZER] [--quantization {awq,gptq,squeezellm,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N] [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code] [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}] - [--kv-cache-scales-path KV_CACHE_SCALES_PATH] + [--scales-path KV_CACHE_SCALES_PATH] Benchmark Throughput Example optional arguments: @@ -88,9 +86,9 @@ optional arguments: --dtype {auto,half,float16,bfloat16,float,float32} data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. --enforce-eager enforce eager execution --kv-cache-dtype {auto,fp8} Data type for kv cache storage. If "auto", will use model data type. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported ```for common inference criteria. - --kv-cache-scales-path KV_CACHE_SCALES_PATH Path to the JSON files containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. + --scales-path KV_CACHE_SCALES_PATH Path to the JSON file containing the KV cache scaling factors. This should generally be supplied, when KV cache dtype is FP8. Otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria. ``` ``` Example: -python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --kv-cache-scales-path +python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --scales-path ```python diff --git a/vllm/config.py b/vllm/config.py index d21add66105b9..bc46cf1cf95d3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,7 +27,7 @@ class ModelConfig: downloading the model and tokenizer. download_dir: Directory to download and load the weights, default to the default cache directory of huggingface. - scales_path: Path to file containing scaling factors. Used to load + scales_path: Path to JSON file containing scaling factors. Used to load KV cache scaling factors into the model when KV cache type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also be used to load activation and weight scaling factors when the model dtype is diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 64fb58f4ace7e..8c24a77d569d3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -136,7 +136,7 @@ def add_cli_args( '--scales-path', type=str, default=None, - help='Path to the JSON file containing scaling factors. ' + help='Path to the JSON file containing the KV cache scaling factors. ' 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. 
' diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 0019e9b94b3a2..59e18cc82fd20 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -85,7 +85,7 @@ def __init__( f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " - f"kv_cache_scales_path={model_config.kv_cache_scales_path}, " + f"scales_path={model_config.scales_path}, " f"seed={model_config.seed})") # TODO(woosuk): Print more configs in debug mode. diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 10d97359278dd..f554a456b09b6 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -5,7 +5,7 @@ import json import os from collections import defaultdict -from typing import Any, Iterator, List, Optional, Tuple +from typing import Any, Iterable, Iterator, List, Optional, Tuple from huggingface_hub import snapshot_download, HfFileSystem import numpy as np @@ -266,8 +266,7 @@ def kv_cache_scales_loader(filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int, - model_type: Optional[str], - rank_keyword="rank") -> Iterator[Tuple[int, float]]: + model_type: Optional[str]) -> Iterable[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. Used by the model to populate the appropriate @@ -293,13 +292,21 @@ def kv_cache_scales_loader(filename: str, assert isinstance(schema, dict), malformed_schema_str if schema["model_type"] is not None: assert model_type == schema["model_type"], f"Model type is {model_type} but loaded " \ - f"scaling factors belonging to different model type {schema["model_type"]}!" + f"scaling factors belonging to different model type {schema['model_type']}!" 
assert isinstance(schema["kv_cache"], dict), malformed_schema_str assert schema["kv_cache"]["dtype"] == "fp8", "Loaded scaling factors intended for KV " \ - f"cache dtype = {schema["kv_cache"]["dtype"]} rather than FP8!" + f"cache dtype = {schema['kv_cache']['dtype']} rather than FP8!" assert isinstance(schema["kv_cache"]["scaling_factor"], dict), malformed_schema_str + raw_rank_scales_map = schema["kv_cache"]["scaling_factor"] + # The keys in raw_rank_scales_map should be strings with the format + # f"{rank_keyword}{tp_rank}", where rank_keyword is an alphabetical string shared + # amongst all keys and tp_rank is a numeric string. Thus, recovering the alphabetical + # components of any key should return rank_keyword + rank_keyword = "".join(char for char in + next(iter(raw_rank_scales_map.keys())) + if char.isalpha()) rank_scales_map = {int(rank.replace(rank_keyword, "")) : scales_map - for rank, scales_map in schema["kv_cache"]["scaling_factor"].items()} + for rank, scales_map in raw_rank_scales_map.items()} assert len(rank_scales_map) != 0, "Loaded dictionary is empty." 
loaded_tp_size = max(rank_scales_map.keys()) + 1 assert loaded_tp_size == tp_size, f"Loaded dictionary has TP size {loaded_tp_size} " \ @@ -327,7 +334,7 @@ def kv_cache_scales_loader(filename: str, except Exception as e: logger.error(f"An error occurred while reading '{filename}': {e}") # This section is reached if and only if any of the excepts are hit - # Return an empty iterator (tuple) => no KV cache scales are loaded + # Return an empty iterable (tuple) => no KV cache scales are loaded # which effectively defaults to 1.0 scales logger.warn(f"Defaulting to KV cache scaling factors = 1.0 for all layers in TP rank {tp_rank}" " as an error occurred during loading.") From ef85f98ad5e392e5067115718b0f877fddd90711 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 7 Mar 2024 18:04:54 +0000 Subject: [PATCH 120/159] Address reviewer comments --- 3rdparty/quantizer/extract_scales.py | 62 ++++++++-------- .../llama2-7b-fp8-kv/kv_cache_scales.json | 72 ++++++++++--------- vllm/model_executor/models/llama.py | 1 - vllm/model_executor/weight_utils.py | 6 +- 4 files changed, 70 insertions(+), 71 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 7fba41905ce38..aa32e3ed37101 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -176,19 +176,21 @@ def _metadata_extractor(quantized_model_dir: str, metadata_extract_fns: Dict[str, Callable[[Dict[str, Any]], Any]]) \ -> Dict[str, Any]: """ - Given the quantized model directory, tries to extract metadata from the JSON files in this - directory. It is assumed that each JSON file corresponds to a JSON-serialization of some - dictionary (the "JSON-dictionary"). The metadata fields to be extracted and how to extract - it is specified in metadata_extract_fns, which is a dictionary mapping metadata field names - to extraction functions. 
- Extraction functions should take in a JSON-dictionary as the sole argument and return the - metadata corresponding to that function. The extraction function is allowed to raise - exceptions, but the special exceptions KeyError or ValueError must be raised if and only if - the metadata field cannot be extracted from the current JSON-dictionary yet it remains - possible that said metadata field can be found in another JSON-dictionary. - Returns a dictionary mapping metadata fields to their extracted data. The dictionary's keys - are exactly the same as those in `metadata_extract_fns`: if any fields could not be extracted, - their value is set to None and a warning is printed. + Given a directory containing quantized model files, this function aims to extract metadata + from the JSON files within this directory. Each JSON file is expected to represent a + dictionary in JSON format (referred to as a "JSON-dictionary"). Metadata extraction is + defined by a dictionary called metadata_extract_fns, where each metadata field name is + mapped to an extraction function. + + These extraction functions are designed to take a JSON-dictionary as their only argument + and return the corresponding metadata. While extraction functions are permitted to raise + exceptions, they should only raise a KeyError or ValueError if the metadata field cannot + be extracted from the current JSON-dictionary, yet there's a possibility of finding it in + another JSON-dictionary. + + The function returns a dictionary that maps metadata fields to their extracted data. The + keys of this dictionary correspond exactly to those in metadata_extract_fns. If any fields + fail to be extracted, their corresponding values are set to None, and a warning is printed. 
""" if not os.path.isdir(quantized_model_dir): raise FileNotFoundError(f"The quantized model directory `{quantized_model_dir}` " @@ -229,7 +231,8 @@ def _metadata_extractor(quantized_model_dir: str, # Warn if we cannot find any of the requested metadata for metadata_name in metadata_extract_fns: if metadata_name not in result: - print(f"Unable to find requested metadata field `{metadata_name}`!") + print(f"WARNING: Unable to find requested metadata field `{metadata_name}`, " + "setting it to None.") result[metadata_name] = None return result @@ -241,16 +244,15 @@ def main(args): "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]), "model_dtype": lambda json_dict: json_dict["dtype"] } - metadata_dict = _metadata_extractor(args.quantized_model, metadata_extract_fns) - - hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) + recovered_metadata = _metadata_extractor(args.quantized_model, metadata_extract_fns) if args.tp_size is not None: - metadata_tp_size = metadata_dict["tp_size"] + metadata_tp_size = recovered_metadata["tp_size"] if metadata_tp_size is not None: assert args.tp_size == metadata_tp_size, "User expected TP world size = " \ f"{args.tp_size} but found TP world size = {metadata_tp_size} from metadata!" - expected_tp_size = args.tp_size or metadata_dict["tp_size"] + expected_tp_size = args.tp_size or recovered_metadata["tp_size"] rank_keyword = "rank" + hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, rank_keyword, expected_tp_size) # Postprocess: formatting to the current schema. 
Consider pulling it out into a dedicated @@ -259,24 +261,16 @@ def main(args): {k: scale[k] for k in sorted(scale.keys())} for rank, scale in rank_scales_map.items() } - model_dtype = metadata_dict["model_dtype"] - # Consider unifying and formalizing this into its own class (and other necessary subclasses) in - # the future - schema = { "model_type": metadata_dict["model_type"], + # Consider generalizing and formalizing this into its own class (and other necessary + # subclasses) in the future + schema = { "model_type": recovered_metadata["model_type"], "kv_cache": { - "dtype": "fp8" if len(rank_scales_map) > 0 else model_dtype, + "dtype": "float8_e4m3fn" if len(rank_scales_map) > 0 \ + else recovered_metadata["model_dtype"], "scaling_factor": rank_scales_map }, - # The fields below this comment are not used or checked for now - # but will be in the future - "activation": { - "dtype": model_dtype, - "scaling_factor": None, - }, - "weight": { - "dtype": model_dtype, - "scaling_factor": None - } + # TODO: Expand this with activation and weights scaling factors when they + # are used in the future } if args.output_dir is None: diff --git a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json index 3a29f7e321391..de31ee9fdd1db 100644 --- a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json +++ b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json @@ -1,36 +1,42 @@ { - "0": { - "0": 0.0152239128947258, - "1": 0.0188860222697258, - "2": 0.0354178324341774, - "3": 0.0376674123108387, - "4": 0.0418526791036129, - "5": 0.0433175228536129, - "6": 0.0397600457072258, - "7": 0.0424455925822258, - "8": 0.0415387861430645, - "9": 0.0408412404358387, - "10": 0.0395856611430645, - "11": 0.0377371683716774, - "12": 0.0400739423930645, - "13": 0.040771484375, - "14": 0.0393415205180645, - "15": 0.0369001142680645, - "16": 0.03857421875, - "17": 0.0387486070394516, - "18": 0.0403180830180645, - "19": 0.0396205373108387, - "20": 
0.0375627800822258, - "21": 0.0407366082072258, - "22": 0.0432477705180645, - "23": 0.0377022884786129, - "24": 0.0399693101644516, - "25": 0.0374581478536129, - "26": 0.0413295216858387, - "27": 0.0442243330180645, - "28": 0.0424804724752903, - "29": 0.0456891767680645, - "30": 0.0409109964966774, - "31": 0.0482352152466774 + "model_type": "llama", + "kv_cache": { + "dtype": "float8_e4m3fn", + "scaling_factor": { + "rank0": { + "0": 0.0152239128947258, + "1": 0.0188860222697258, + "2": 0.0354178324341774, + "3": 0.0376674123108387, + "4": 0.0418526791036129, + "5": 0.0433175228536129, + "6": 0.0397600457072258, + "7": 0.0424455925822258, + "8": 0.0415387861430645, + "9": 0.0408412404358387, + "10": 0.0395856611430645, + "11": 0.0377371683716774, + "12": 0.0400739423930645, + "13": 0.040771484375, + "14": 0.0393415205180645, + "15": 0.0369001142680645, + "16": 0.03857421875, + "17": 0.0387486070394516, + "18": 0.0403180830180645, + "19": 0.0396205373108387, + "20": 0.0375627800822258, + "21": 0.0407366082072258, + "22": 0.0432477705180645, + "23": 0.0377022884786129, + "24": 0.0399693101644516, + "25": 0.0374581478536129, + "26": 0.0413295216858387, + "27": 0.0442243330180645, + "28": 0.0424804724752903, + "29": 0.0456891767680645, + "30": 0.0409109964966774, + "31": 0.0482352152466774 + } + } } } \ No newline at end of file diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f1b4ba41ee29a..42c0b0f1a1cf6 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -49,7 +49,6 @@ from vllm.sequence import SamplerOutput from vllm.config import LoRAConfig from vllm.utils import is_hip -from vllm.utils import is_hip KVCache = Tuple[torch.Tensor, torch.Tensor] diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index f554a456b09b6..8510866cb3cdd 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -294,8 +294,8 @@ def 
kv_cache_scales_loader(filename: str, assert model_type == schema["model_type"], f"Model type is {model_type} but loaded " \ f"scaling factors belonging to different model type {schema['model_type']}!" assert isinstance(schema["kv_cache"], dict), malformed_schema_str - assert schema["kv_cache"]["dtype"] == "fp8", "Loaded scaling factors intended for KV " \ - f"cache dtype = {schema['kv_cache']['dtype']} rather than FP8!" + assert schema["kv_cache"]["dtype"] == "float8_e4m3fn", "Loaded scaling factors intended " \ + f"for KV cache dtype = {schema['kv_cache']['dtype']} rather than FP8!" assert isinstance(schema["kv_cache"]["scaling_factor"], dict), malformed_schema_str raw_rank_scales_map = schema["kv_cache"]["scaling_factor"] # The keys in raw_rank_scales_map should be strings with the format @@ -307,7 +307,7 @@ def kv_cache_scales_loader(filename: str, if char.isalpha()) rank_scales_map = {int(rank.replace(rank_keyword, "")) : scales_map for rank, scales_map in raw_rank_scales_map.items()} - assert len(rank_scales_map) != 0, "Loaded dictionary is empty." + assert len(rank_scales_map) != 0, "Loaded KV scales dictionary is empty." loaded_tp_size = max(rank_scales_map.keys()) + 1 assert loaded_tp_size == tp_size, f"Loaded dictionary has TP size {loaded_tp_size} " \ f"but LLM engine is currently running with TP size {tp_size}." 
From d8b28438d839fd2cf49ad5490782ed1a93dd1c8e Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 7 Mar 2024 18:11:31 +0000 Subject: [PATCH 121/159] Address Greg's strong type checking :) --- 3rdparty/quantizer/extract_scales.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index aa32e3ed37101..9c90d8944c35e 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -7,7 +7,7 @@ import os from safetensors.torch import safe_open import torch -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple # Adapted from vllm/model_executor/weight_utils.py @@ -90,7 +90,7 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, torch.cuda.empty_cache() -def _kv_scales_extractor(hf_tensor_files: Iterator[str], +def _kv_scales_extractor(hf_tensor_files: Iterable[str], use_safetensors: bool, rank_keyword: str = "rank", expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: From 52df60398af26a5e1394eca90a5ae6e9f593d0e1 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 7 Mar 2024 19:55:03 +0000 Subject: [PATCH 122/159] Add an additional TODO --- 3rdparty/quantizer/extract_scales.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 9c90d8944c35e..30b3354a9b76c 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -310,7 +310,8 @@ def main(args): default=None) parser.add_argument("--output_name", help="Optionally specify the output filename.", - default="kv_cache_scales.json") + # TODO: Change this once additional scaling factors are enabled + default="kv_cache_scales.json") parser.add_argument("--tp_size", help="Optionally specify the tensor-parallel (TP) size that the " "quantized model 
should correspond to. If specified, during KV " From 18c55d2af5730a9e9a3eec8d414e72dbb270fc56 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Thu, 7 Mar 2024 21:41:01 +0000 Subject: [PATCH 123/159] Fix OOM bug in quantize script, remove extraneous model_export --- 3rdparty/quantizer/quantize.py | 14 +-- .../llama2-70b-fp8-kv/kv_cache_scales.json | 90 +++++++++++++++++++ 2 files changed, 91 insertions(+), 13 deletions(-) create mode 100644 tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json diff --git a/3rdparty/quantizer/quantize.py b/3rdparty/quantizer/quantize.py index a68f21a89c65d..dfc21e7fc3918 100644 --- a/3rdparty/quantizer/quantize.py +++ b/3rdparty/quantizer/quantize.py @@ -193,7 +193,7 @@ def get_calib_dataloader(data="cnn_dailymail", batch_encoded = tokenizer.batch_encode_plus(dataset, return_tensors="pt", - padding=True, + padding="max_length", truncation=True, max_length=block_size) if device: @@ -312,18 +312,6 @@ def main(args): export_tensorrt_llm_config=False, export_npz=export_npz) - # export npz (reference) - export_model_config(model, - model_type, - getattr(torch, args.dtype), - export_dir=export_path, - inference_tensor_parallel=args.tp_size, - inference_pipeline_parallel=args.pp_size, - # export_tensorrt_llm_config=(not export_npz), - export_tensorrt_llm_config=False, - # export_npz=export_npz, - export_npz=True) - # Workaround for wo quantization if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: with open(f"{export_path}/config.json", 'r') as f: diff --git a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json new file mode 100644 index 0000000000000..c7d98dcd14697 --- /dev/null +++ b/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json @@ -0,0 +1,90 @@ +{ + "model_type": "llama", + "kv_cache": { + "dtype": "float8_e4m3fn", + "scaling_factor": { + "rank0": { + "0": 0.0230364128947258, + "1": 0.01979283057153225, + "2": 0.0241350457072258, + "3": 0.0308314748108387, + "4": 
0.0430733822286129, + "5": 0.0370396226644516, + "6": 0.0306222103536129, + "7": 0.0357491634786129, + "8": 0.0358189195394516, + "9": 0.0443289652466774, + "10": 0.0433175228536129, + "11": 0.0416782945394516, + "12": 0.0366908498108387, + "13": 0.0432477705180645, + "14": 0.0410505048930645, + "15": 0.0457589291036129, + "16": 0.0418526791036129, + "17": 0.0432477705180645, + "18": 0.0469447560608387, + "19": 0.0514787957072258, + "20": 0.0541294664144516, + "21": 0.0587681382894516, + "22": 0.0625, + "23": 0.0585588738322258, + "24": 0.0600237175822258, + "25": 0.0588030144572258, + "26": 0.0531180277466774, + "27": 0.06396484375, + "28": 0.0603027381002903, + "29": 0.0582101047039032, + "30": 0.0625348836183548, + "31": 0.0585588738322258, + "32": 0.0582798570394516, + "33": 0.0575125589966774, + "34": 0.0590820349752903, + "35": 0.0614188089966774, + "36": 0.0631975457072258, + "37": 0.0615931935608387, + "38": 0.0601283498108387, + "39": 0.0571986623108387, + "40": 0.0670340433716774, + "41": 0.0523507259786129, + "42": 0.0547223798930645, + "43": 0.0631975457072258, + "44": 0.0663713738322258, + "45": 0.0603376142680645, + "46": 0.0652204304933548, + "47": 0.0734514519572258, + "48": 0.0693708211183548, + "49": 0.0725446492433548, + "50": 0.0627790242433548, + "51": 0.0691266804933548, + "52": 0.0688825398683548, + "53": 0.068429134786129, + "54": 0.0605119988322258, + "55": 0.0799386203289032, + "56": 0.0853097140789032, + "57": 0.0661969929933548, + "58": 0.0689871683716774, + "59": 0.0724051371216774, + "60": 0.0541643425822258, + "61": 0.0626743882894516, + "62": 0.0628487765789032, + "63": 0.0607212632894516, + "64": 0.0589076466858387, + "65": 0.0451660193502903, + "66": 0.0453055277466774, + "67": 0.0414341539144516, + "68": 0.0385044664144516, + "69": 0.0414341539144516, + "70": 0.0466308631002903, + "71": 0.0399693101644516, + "72": 0.0437011756002903, + "73": 0.0434221550822258, + "74": 0.0428989976644516, + "75": 0.0401785746216774, + "76": 
0.0431082621216774, + "77": 0.0484444759786129, + "78": 0.0417829267680645, + "79": 0.0418178029358387 + } + } + } +} \ No newline at end of file From 7d0fa2f01251aea8eec2b5c16bba30df35fcc408 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Thu, 7 Mar 2024 21:57:24 +0000 Subject: [PATCH 124/159] Fix rocm build conditions --- setup.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 41fc7f4bb6ead..48961108b8322 100644 --- a/setup.py +++ b/setup.py @@ -510,6 +510,13 @@ def get_requirements() -> List[str]: ext_modules = [] package_data["vllm"].append("*.so") +cmdclass={"build_ext": build_ext} if not (_is_neuron() or _is_hip()) else {} + +if not (_is_neuron() or _is_hip()): + distclass = BinaryDistribution +else: + distclass = None + setuptools.setup( name="vllm", version=get_vllm_version(), @@ -537,7 +544,7 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": build_ext} if not _is_neuron() else {}, - distclass=BinaryDistribution, + cmdclass=cmdclass, + distclass=distclass, package_data=package_data, ) From e7db6afabf0ced19a2db1e1ba335ebbca04f58e2 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Thu, 7 Mar 2024 22:01:20 +0000 Subject: [PATCH 125/159] Keep previous build flow for neuron --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 48961108b8322..286b90fdf6fbc 100644 --- a/setup.py +++ b/setup.py @@ -512,7 +512,7 @@ def get_requirements() -> List[str]: cmdclass={"build_ext": build_ext} if not (_is_neuron() or _is_hip()) else {} -if not (_is_neuron() or _is_hip()): +if not _is_hip(): distclass = BinaryDistribution else: distclass = None From ca1b39c0addf86ad2e90c6d12be73df1cc9553f5 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 7 Mar 2024 11:42:42 -0800 Subject: [PATCH 126/159] Measure model memory usage (#3120) --- vllm/utils.py | 25 
+++++++++++++++++++++++++ vllm/worker/model_runner.py | 18 ++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 07d29bd8fe303..3dbe96acecfd5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -3,6 +3,7 @@ import socket import subprocess import uuid +import gc from platform import uname from typing import List, Tuple, Union from packaging.version import parse, Version @@ -309,3 +310,27 @@ def create_kv_caches_with_random( f"Does not support value cache of type {cache_dtype}") value_caches.append(value_cache) return key_caches, value_caches + + +class measure_cuda_memory: + + def __init__(self, device=None): + self.device = device + + def current_memory_usage(self) -> float: + # Return the memory usage in bytes. + torch.cuda.reset_peak_memory_stats(self.device) + mem = torch.cuda.max_memory_allocated(self.device) + return mem + + def __enter__(self): + self.initial_memory = self.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = self.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 41603d9c22ffc..5afe11d38f1d0 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,7 +21,7 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest -from vllm.utils import in_wsl +from vllm.utils import in_wsl, measure_cuda_memory logger = init_logger(__name__) @@ -85,11 +85,17 @@ def __init__( self.model_config.enforce_eager = True def load_model(self) -> None: - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - 
scheduler_config=self.scheduler_config) + with measure_cuda_memory() as m: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + + self.model_memory_usage = m.consumed_memory + logger.info( + f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" + ) vocab_size = self.model.config.vocab_size From fd6e57ed1283f4a0dfc1e903fdf6ce59014137b7 Mon Sep 17 00:00:00 2001 From: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com> Date: Thu, 7 Mar 2024 18:03:22 -0500 Subject: [PATCH 127/159] Possible fix for conflict between Automated Prefix Caching (#2762) and multi-LoRA support (#1804) (#3263) --- tests/test_cache_block_hashing.py | 46 +++++++++++++++++++++---------- vllm/sequence.py | 3 +- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index c2067e52b59c0..fb541f38f3489 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -2,8 +2,11 @@ Run `pytest tests/test_cache_block_hashing.py`. 
""" +from typing import List, Optional + import pytest +from vllm.lora.request import LoRARequest from vllm.transformers_utils.tokenizer import TokenizerGroup from vllm.sequence import Sequence @@ -36,7 +39,10 @@ def flatten_2d(li): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("max_num_seqs", [256]) -def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): +@pytest.mark.parametrize("concurrent_lora_int_ids", + [[None], [1], [None, 1], [None, 1, 2], [1, 2]]) +def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, + concurrent_lora_int_ids: List[Optional[int]]): tokenizer = TokenizerGroup( tokenizer_id="facebook/opt-125m", @@ -48,20 +54,30 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int): hashes = [] for prefix in prefixes: - hashes.append([]) - prompts = [prefix + prompt for prompt in sample_prompts] - seq_id = 0 - for prompt in prompts: - hashes[-1].append([]) - prompt_token_ids = tokenizer.encode(prompt) - seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, - tokenizer.tokenizer.eos_token_id) - - num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): - hashes[-1][-1].append(seq.hash_of_block(idx)) - - seq_id += 1 + for lora_int_id in concurrent_lora_int_ids: + lora_request = None + + if lora_int_id is not None: + lora_request = LoRARequest( + f"example_lora_{lora_int_id}", + lora_int_id, + f"example/path/to/lora_{lora_int_id}", + ) + + hashes.append([]) + prompts = [prefix + prompt for prompt in sample_prompts] + seq_id = 0 + for prompt in prompts: + hashes[-1].append([]) + prompt_token_ids = tokenizer.encode(prompt) + seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, + tokenizer.tokenizer.eos_token_id, lora_request) + + num_blocks = len(prompt_token_ids) // block_size + for idx in range(num_blocks): + hashes[-1][-1].append(seq.hash_of_block(idx)) + + seq_id += 1 # 
Check that hashes made with two prefixes with different first blocks are # different everywhere. diff --git a/vllm/sequence.py b/vllm/sequence.py index 19dafe3cb0fc9..fee96a875dde5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -175,7 +175,8 @@ def hash_of_block(self, logical_idx: int) -> int: # TODO: The current hashing function is O(L^2). We should optimize # this in the future. num_tokens = self.num_hashed_tokens_of_block(logical_idx) - return hash(tuple(self.data.get_token_ids()[0:num_tokens])) + return hash( + (tuple(self.data.get_token_ids()[0:num_tokens]), self.lora_int_id)) def num_hashed_tokens_of_block(self, logical_idx: int): return logical_idx * self.block_size + self.block_size From 90c2cd4187f5084264149c4a43e19dd8b9a5b371 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Thu, 7 Mar 2024 23:32:21 +0000 Subject: [PATCH 128/159] Update fp8 examples --- tests/fp8_kv/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index 0f0155dce3a0d..c60bddc1151c0 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -52,7 +52,7 @@ Optional arguments: ``` ```python Example: -python3 3rdparty/quantizer/extract_scales.py --model --tp_size --output_dir +python3 3rdparty/quantizer/extract_scales.py --quantized_model --tp_size --output_dir ``` ### 4. Load KV Cache Scaling Factors into VLLM. This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The recently generated KV cache scaling factors are now integrated into the benchmarking process and allow for KV cache scaling factors to be utilized for FP8. 
From 9e6144a6a8ee724e47626b8ffb022e85c9fbc84b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 8 Mar 2024 10:52:20 -0800 Subject: [PATCH 129/159] [FIX] Make `flash_attn` optional (#3269) --- .gitignore | 3 - setup.py | 58 +------------------ vllm/__init__.py | 30 +++------- .../layers/attention/attention.py | 37 ++++++++++-- .../layers/attention/backends/flash_attn.py | 1 - 5 files changed, 39 insertions(+), 90 deletions(-) diff --git a/.gitignore b/.gitignore index 6ff62f1c75806..b1513ef0ddb0c 100644 --- a/.gitignore +++ b/.gitignore @@ -185,6 +185,3 @@ hip_compat.h # Benchmark dataset *.json - -# Third-party Python packages. -vllm/thirdparty_files/ diff --git a/setup.py b/setup.py index 286b90fdf6fbc..879ffaa3ae732 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,6 @@ import os import re import subprocess -import sys import warnings from pathlib import Path from typing import List, Set @@ -15,8 +14,6 @@ from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) -# This is a temporary directory to store third-party packages. -THIRDPARTY_SUBDIR = "vllm/thirdparty_files" # If you are developing the C++ backend of vLLM, consider building vLLM with # `python setup.py develop` since it will give you incremental builds. @@ -341,61 +338,9 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS_PUNICA, }, )) -elif _is_hip(): - amd_archs = os.getenv("GPU_ARCHS") - if amd_archs is None: - amd_archs = get_amdgpu_offload_arch() - for arch in amd_archs.split(";"): - if arch not in ROCM_SUPPORTED_ARCHS: - raise RuntimeError( - f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {arch}") - NVCC_FLAGS += [f"--offload-arch={arch}"] - NVCC_FLAGS += ["-DENABLE_FP8_E4M3"] - elif _is_neuron(): neuronxcc_version = get_neuronxcc_version() - # Download the FlashAttention package. 
- # Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/setup.py#L518-L530 - flash_attn_version = "2.5.6" - install_dir = os.path.join(ROOT_DIR, THIRDPARTY_SUBDIR) - subprocess.check_call( - [ - sys.executable, - "-m", - "pip", - "install", - "-q", - f"--target={install_dir}", - "einops", # Dependency of flash-attn. - f"flash-attn=={flash_attn_version}", - "--no-dependencies", # Required to avoid re-installing torch. - ], - env=dict(os.environ, CC="gcc"), - ) - - # Copy the FlashAttention package into the vLLM package after build. - class build_ext(BuildExtension): - - def run(self): - super().run() - target_dir = os.path.join(self.build_lib, THIRDPARTY_SUBDIR) - if not os.path.exists(target_dir): - os.makedirs(target_dir) - self.copy_tree(install_dir, target_dir) - - class BinaryDistribution(setuptools.Distribution): - - def has_ext_modules(self): - return True - -else: - build_ext = BuildExtension - BinaryDistribution = setuptools.Distribution - if _is_neuron(): - neuronxcc_version = get_neuronxcc_version() - vllm_extension_sources = [ "csrc/cache_kernels.cu", "csrc/attention/attention_kernels.cu", @@ -544,7 +489,6 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass=cmdclass, - distclass=distclass, + cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {}, package_data=package_data, ) diff --git a/vllm/__init__.py b/vllm/__init__.py index 59f1345b58d42..f1e30f5eb6e6e 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,28 +1,12 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" - -# Adapted from https://github.com/ray-project/ray/blob/f92928c9cfcbbf80c3a8534ca4911de1b44069c0/python/ray/__init__.py#L11 -def _configure_system(): - import os - import sys - - # Importing flash-attn. 
- thirdparty_files = os.path.join(os.path.abspath(os.path.dirname(__file__)), - "thirdparty_files") - sys.path.insert(0, thirdparty_files) - - -_configure_system() -# Delete configuration function. -del _configure_system - -from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs # noqa: E402 -from vllm.engine.async_llm_engine import AsyncLLMEngine # noqa: E402 -from vllm.engine.llm_engine import LLMEngine # noqa: E402 -from vllm.engine.ray_utils import initialize_cluster # noqa: E402 -from vllm.entrypoints.llm import LLM # noqa: E402 -from vllm.outputs import CompletionOutput, RequestOutput # noqa: E402 -from vllm.sampling_params import SamplingParams # noqa: E402 +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.engine.llm_engine import LLMEngine +from vllm.engine.ray_utils import initialize_cluster +from vllm.entrypoints.llm import LLM +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.sampling_params import SamplingParams __version__ = "0.3.3" diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 830e82e10f7ad..724dd0511c5aa 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -1,12 +1,16 @@ """Attention layer.""" +from functools import lru_cache from typing import List, Optional import torch import torch.nn as nn +from vllm.logger import init_logger from vllm.model_executor.input_metadata import InputMetadata from vllm.utils import is_hip +logger = init_logger(__name__) + class Attention(nn.Module): """Attention layer. @@ -30,17 +34,12 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and - torch.get_default_dtype() in (torch.float16, torch.bfloat16)): - # Ampere or later NVIDIA GPUs. 
- # NOTE(woosuk): FlashAttention does not support FP32. + if _use_flash_attn(): from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) else: - # Turing and Volta NVIDIA GPUs or AMD GPUs. - # Or FP32 on any GPU. from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend self.backend = XFormersBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, @@ -57,3 +56,29 @@ def forward( ) -> torch.Tensor: return self.backend.forward(query, key, value, key_cache, value_cache, input_metadata) + + +@lru_cache(maxsize=1) +def _use_flash_attn() -> bool: + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return False + + if is_hip(): + # AMD GPUs. + return False + if torch.cuda.get_device_capability()[0] < 8: + # Volta and Turing NVIDIA GPUs. + logger.info("flash_attn is not supported on Turing or older GPUs. " + "Using xformers backend.") + return False + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): + logger.info( + "flash_attn only supports torch.float16 or torch.bfloat16. " + "Using xformers backend.") + return False + + logger.info("Using flash_attn backend.") + return True diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 512f4e49c7eb2..4abe195f274a7 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -1,7 +1,6 @@ """Attention layer with Flash and PagedAttention.""" from typing import List, Optional -# NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/. 
from flash_attn import flash_attn_func import torch From fd01e9affd89885dd9a2517526d63b319325b0e9 Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 8 Mar 2024 21:20:54 +0000 Subject: [PATCH 130/159] Fix setup.py up to where it should be before the excitement of the last 24h (upstream flashattn changes were broken) --- setup.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/setup.py b/setup.py index 879ffaa3ae732..c65186f14e0d3 100644 --- a/setup.py +++ b/setup.py @@ -455,13 +455,6 @@ def get_requirements() -> List[str]: ext_modules = [] package_data["vllm"].append("*.so") -cmdclass={"build_ext": build_ext} if not (_is_neuron() or _is_hip()) else {} - -if not _is_hip(): - distclass = BinaryDistribution -else: - distclass = None - setuptools.setup( name="vllm", version=get_vllm_version(), From 6edfbf19dfa0439abc7047ef066ddef0e7502bfc Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 8 Mar 2024 22:13:15 +0000 Subject: [PATCH 131/159] Fix missing enable FP8_E4M3 flag and cherry pick newest load convention --- setup.py | 1 + vllm/model_executor/models/llama.py | 18 ++++++------------ vllm/model_executor/weight_utils.py | 2 +- vllm/worker/model_runner.py | 4 ---- 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index c65186f14e0d3..e5c71f87562fb 100644 --- a/setup.py +++ b/setup.py @@ -226,6 +226,7 @@ def get_torch_arch_list() -> Set[str]: if _is_hip(): rocm_arches = get_pytorch_rocm_arch() NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches] + NVCC_FLAGS += ["-DENABLE_FP8_E4M3"] else: # First, check the TORCH_CUDA_ARCH_LIST environment variable. compute_capabilities = get_torch_arch_list() diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index aa4a05b220980..0f31af51abb79 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -399,27 +399,21 @@ def load_weights(self, # factors (or else raise an exception). 
Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state def load_kv_cache_scales(self, scales_path: str) -> None: - # Initialize KV cache scales to dummy values first. These will be - # overwritten by the actual values if and only if the later loading - # process completes without error - self.load_dummy_kv_cache_scales() tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_loader( scales_path, tp_rank, tp_size, self.config.num_hidden_layers, self.config.__class__.model_type): - layer_paged_attn = self.model.layers[layer_idx].self_attn.attn + layer_paged_attn = self.model.layers[layer_idx].self_attn.attn.backend if is_hip(): # The scaling factor convention we are assuming is # quantized_value * scaling_factor ~= true_value # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - layer_paged_attn.kv_cache_scaling_factor = scaling_factor - - # Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) - def load_dummy_kv_cache_scales(self) -> None: - for layer_idx in range(self.model.config.num_hidden_layers): - layer_paged_attn = self.model.layers[layer_idx].self_attn.attn - setattr(layer_paged_attn, "kv_cache_scaling_factor", 1.0) + if hasattr(layer_paged_attn, "kv_cache_scaling_factor"): + layer_paged_attn.kv_cache_scaling_factor = scaling_factor + else: + raise RuntimeError("PagedAttention has no KV cache scaling " + "factor attribute!") diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 2a1dc7566a923..3405556bbd116 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -294,7 +294,7 @@ def kv_cache_scales_loader(filename: str, f"scaling factors belonging to different model type {schema['model_type']}!" 
assert isinstance(schema["kv_cache"], dict), malformed_schema_str assert schema["kv_cache"]["dtype"] == "float8_e4m3fn", "Loaded scaling factors intended " \ - f"for KV cache dtype = {schema['kv_cache']['dtype']} rather than FP8!" + f"for KV cache dtype = {schema['kv_cache']['dtype']} rather than float8_e4m3fn!" assert isinstance(schema["kv_cache"]["scaling_factor"], dict), malformed_schema_str raw_rank_scales_map = schema["kv_cache"]["scaling_factor"] # The keys in raw_rank_scales_map should be strings with the format diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5afe11d38f1d0..8c71b9e7f3e10 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -124,10 +124,6 @@ def load_model(self) -> None: raise RuntimeError("Using FP8 KV cache and scaling factors provided but " f"model {self.model.__class__} does not support loading " "scaling factors.") - elif callable(getattr(self.model, "load_dummy_kv_cache_scales", None)): - logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " - "scaling factors of 1.0, This may lead to less accurate results!") - self.model.load_dummy_kv_cache_scales() else: logger.warn(f"Using FP8 KV cache but no scaling factors provided. 
Defaulting to " "scaling factors of 1.0, This may lead to less accurate results!") From dd469df6bb1e518c0c3a6ecb2f83dcb8a70a4b7d Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 8 Mar 2024 22:36:30 +0000 Subject: [PATCH 132/159] Add model flag as example option --- tests/fp8_kv/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index c60bddc1151c0..0c0142f9d41fd 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -92,5 +92,5 @@ optional arguments: ``` ``` Example: -python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --scales-path +python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --scales-path --model ```python From 2f60ad72815d3e84fb2250097aa6e1d77de567b0 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Wed, 13 Mar 2024 03:47:34 +0000 Subject: [PATCH 133/159] Fix ruff syntax errors --- .../fp8/amd_detail/hip_float8_impl.h | 6 +- tests/fp8_kv/README.md | 2 +- tests/kernels/test_attention.py | 1 + tests/kernels/test_cache.py | 31 ++++--- tests/test_cache_block_hashing.py | 4 +- vllm/config.py | 18 +++-- vllm/engine/arg_utils.py | 22 ++--- .../layers/attention/backends/xformers.py | 71 +--------------- .../layers/attention/ops/paged_attn.py | 2 +- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/weight_utils.py | 80 ++++++++++++------- vllm/worker/cache_engine.py | 1 - vllm/worker/model_runner.py | 15 ++-- 13 files changed, 120 insertions(+), 140 deletions(-) diff --git a/csrc/quantization/fp8/amd_detail/hip_float8_impl.h b/csrc/quantization/fp8/amd_detail/hip_float8_impl.h index c88fbd913c2ee..e05905b4e49e8 100644 --- a/csrc/quantization/fp8/amd_detail/hip_float8_impl.h +++ b/csrc/quantization/fp8/amd_detail/hip_float8_impl.h @@ -122,7 +122,7 @@ HIP_FP8_HOST_DEVICE uint8_t to_float8(T _x, bool stoch = false, uint32_t rng = 0 } // First need to check if it is normal or denorm 
as there is a difference of - // implict 1 Then need to adjust the exponent to align with the F8 exponent, + // implicit 1 Then need to adjust the exponent to align with the F8 exponent, // in the meanwhile, shift The mantissa. Then for stochastic rounding, add rng // to mantissa and truncate. And for RNE, no need to add rng. Then probably // need to check whether there is carry and adjust exponent and mantissa again @@ -152,7 +152,7 @@ where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 if (act_exponent <= f8_denormal_act_exponent) { /* This is the case where fp32/fp16 is normal but it is in f8 denormal range. For example fp8 nanoo mode, denormal exponent is -7, but if the - fp32/fp16 actual exponent is -7, it is actually larger due to the implict 1, + fp32/fp16 actual exponent is -7, it is actually larger due to the implicit 1, Therefore it needs to be adjust to -6 and mantissa shift right by 1. So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ exponent_diff = f8_denormal_act_exponent - act_exponent; @@ -179,7 +179,7 @@ where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 mantissa <<= -exponent_diff; } bool implicit_one = mantissa & (1 << mfmt); - // if there is no implict 1, it means the f8 is denormal and need to adjust + // if there is no implicit 1, it means the f8 is denormal and need to adjust // to denorm exponent f8_exponent = (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 
0 : 1); diff --git a/tests/fp8_kv/README.md b/tests/fp8_kv/README.md index 0c0142f9d41fd..59acfb02b1972 100644 --- a/tests/fp8_kv/README.md +++ b/tests/fp8_kv/README.md @@ -36,7 +36,7 @@ The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found [ # prerequisites: # - Quantized HF LLaMa 2 model python3 3rdparty/quantizer/extract_scales.py --help -Useage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] +Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] KV Scale Extraction Example diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index d19fc4720d376..854f819d03e49 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -143,6 +143,7 @@ def test_paged_attention( num_query_heads, num_kv_heads = num_heads query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) query.uniform_(-scale, scale) + gpu_id = f"cuda:{device}" assert num_query_heads % num_kv_heads == 0 num_queries_per_kv = num_query_heads // num_kv_heads diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 7f87a8e31af1b..2f7f7b834e1fe 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -33,7 +33,7 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_copy_blocks( @@ -123,6 +123,8 @@ def test_reshape_and_cache( if torch.cuda.is_available(): torch.cuda.manual_seed(seed) torch.set_default_device(device) + gpu_id = f"cuda:{device}" + # Create a random slot 
mapping. num_slots = block_size * num_blocks slot_mapping = random.sample(range(num_slots), num_tokens) @@ -133,8 +135,9 @@ def test_reshape_and_cache( # Create the KV caches. key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, kv_cache_dtype, - dtype, seed, gpu_id) + num_heads, head_size, + kv_cache_dtype, dtype, + seed, gpu_id) key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. @@ -173,8 +176,10 @@ def test_reshape_and_cache( cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": - assert torch.allclose(result_key_cache, cloned_key_cache, atol=0.001, rtol=0.1) - assert torch.allclose(result_value_cache, cloned_value_cache, atol=0.001, rtol=0.1) + assert torch.allclose(result_key_cache, cloned_key_cache, + atol=0.001, rtol=0.1) + assert torch.allclose(result_value_cache, cloned_value_cache, + atol=0.001, rtol=0.1) else: assert torch.allclose(key_cache, cloned_key_cache) assert torch.allclose(value_cache, cloned_value_cache) @@ -225,14 +230,18 @@ def test_swap_blocks( block_mapping = dict(zip(src_blocks, dst_blocks)) # Create the KV caches on the first device. - src_key_caches, src_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed, - src_device) + src_key_caches, src_value_caches = kv_cache_factory(num_blocks, + block_size, 1, + num_heads, head_size, + kv_cache_dtype, dtype, + seed, src_device) # Create the KV caches on the second device. 
- dist_key_caches, dist_value_caches = kv_cache_factory( - num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, seed, - dst_device) + dist_key_caches, dist_value_caches = kv_cache_factory(num_blocks, + block_size, 1, + num_heads, head_size, + kv_cache_dtype, dtype, + seed, dst_device) src_key_caches_clone = src_key_caches[0].clone() src_value_caches_clone = src_value_caches[0].clone() diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index fb541f38f3489..2f609d2deb93f 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -74,11 +74,9 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, tokenizer.tokenizer.eos_token_id, lora_request) num_blocks = len(prompt_token_ids) // block_size - for idx in range(num_blocks): + for seq_id, idx in enumerate(range(num_blocks)): hashes[-1][-1].append(seq.hash_of_block(idx)) - seq_id += 1 - # Check that hashes made with two prefixes with different first blocks are # different everywhere. for hash0, hash1 in zip(flatten_2d(hashes[0]), flatten_2d(hashes[1])): diff --git a/vllm/config.py b/vllm/config.py index 820ff0e1deea1..3f85bac1e4fee 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -33,8 +33,8 @@ class ModelConfig: scales_path: Path to JSON file containing scaling factors. Used to load KV cache scaling factors into the model when KV cache type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also be used - to load activation and weight scaling factors when the model dtype is - FP8_E4M3 on ROCm. + to load activation and weight scaling factors when the model dtype + is FP8_E4M3 on ROCm. 
load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is @@ -350,14 +350,16 @@ def _verify_cache_dtype(self) -> None: nvcc_cuda_version = get_nvcc_cuda_version() if nvcc_cuda_version < Version("11.8"): raise ValueError( - "FP8 is not supported when cuda version is lower than 11.8." + "FP8 is not supported when cuda version is" + "lower than 11.8." ) logger.info( - "Using fp8 data type to store kv cache. It reduces " - "the GPU memory footprint and boosts the performance. " - "But it may cause slight accuracy drop without scaling factors. " - "FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8." - "On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.") + "Using fp8 data type to store kv cache. It reduces the GPU " + "memory footprint and boosts the performance. " + "But it may cause slight accuracy drop without scaling " + "factors. FP8_E5M2 (without scaling) is only supported on " + "cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 " + "is instead supported for common inference criteria.") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 16258f4c63c2d..815247b01b6be 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -141,18 +141,21 @@ def add_cli_args( type=str, choices=['auto', 'fp8'], default='auto', - help='Data type for kv cache storage. If "auto", will use model data type. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' - 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + help='Data type for kv cache storage. If "auto", will use model ' + 'data type. FP8_E5M2 (without scaling) is only supported on cuda ' + 'version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is instead ' + 'supported for common inference criteria. ') parser.add_argument( '--scales-path', type=str, default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' - 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' - 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + help='Path to the JSON file containing the KV cache ' + 'scaling factors. This should generally be supplied, when ' + 'KV cache dtype is FP8. Otherwise, KV cache scaling factors ' + 'default to 1.0, which may cause accuracy issues. ' + 'FP8_E5M2 (without scaling) is only supported on cuda version' + 'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead ' + 'supported for common inference criteria. ') parser.add_argument('--max-model-len', type=int, default=EngineArgs.max_model_len, @@ -314,7 +317,8 @@ def create_engine_configs( device_config = DeviceConfig(self.device) model_config = ModelConfig( self.model, self.tokenizer, self.tokenizer_mode, - self.trust_remote_code, self.download_dir, self.scales_path, self.load_format, + self.trust_remote_code, self.download_dir, + self.scales_path, self.load_format, self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager, self.max_context_len_to_capture, diff --git a/vllm/model_executor/layers/attention/backends/xformers.py b/vllm/model_executor/layers/attention/backends/xformers.py index 4b5010c1e2085..ffe7e36569f83 100644 --- a/vllm/model_executor/layers/attention/backends/xformers.py +++ b/vllm/model_executor/layers/attention/backends/xformers.py @@ -10,7 +10,6 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention.ops.paged_attn import ( 
PagedAttentionImpl) -from vllm._C import ops from vllm.utils import is_hip @@ -90,7 +89,8 @@ def forward( # profiling run. if key_cache is not None and value_cache is not None: PagedAttentionImpl.reshape_and_cache(key, value, key_cache, - value_cache, input_metadata, self.kv_cache_scaling_factor,) + value_cache, input_metadata, + self.kv_cache_scaling_factor,) if input_metadata.is_prompt: # Prompt run. if (key_cache is None or value_cache is None @@ -171,7 +171,8 @@ def forward( else: # prefix-enabled attention - # TODO(Hai) this triton kernel has regression issue with FP8 KVCache to handle mixed types + # TODO(Hai) this triton kernel has regression issue with + # FP8 KVCache to handle mixed types output = PagedAttentionImpl.forward_prefix( query, key, @@ -267,67 +268,3 @@ def _ref_masked_attention( attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) out = torch.einsum("hqk,khd->qhd", attn_weights, value) return out - block_size = value_cache.shape[3] - num_seqs, num_heads, head_size = query.shape - max_num_partitions = ( - (input_metadata.max_context_len + _PARTITION_SIZE - 1) // - _PARTITION_SIZE) - # NOTE(woosuk): We use a simple heuristic to decide whether to use - # PagedAttention V1 or V2. If the number of partitions is 1, we use - # V1 to avoid the overhead of reduction. Also, if the number of - # sequences or heads is large, we use V1 since there is enough work - # to parallelize. - # TODO(woosuk): Tune this heuristic. - # For context len > 8192, use V2 kernel to avoid shared memory shortage. - use_v1 = input_metadata.max_context_len <= 8192 and ( - max_num_partitions == 1 or num_seqs * num_heads > 512) - if use_v1: - # Run PagedAttention V1. 
- ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - kv_scale, - ) - else: - # Run PagedAttention V2. - assert _PARTITION_SIZE % block_size == 0 - tmp_output = torch.empty( - size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, - device=output.device, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, max_num_partitions), - dtype=torch.float32, - device=output.device, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - input_metadata.block_tables, - input_metadata.context_lens, - block_size, - input_metadata.max_context_len, - alibi_slopes, - input_metadata.kv_cache_dtype, - kv_scale, - ) - return output diff --git a/vllm/model_executor/layers/attention/ops/paged_attn.py b/vllm/model_executor/layers/attention/ops/paged_attn.py index 2dfa77b94124b..131b900fe5108 100644 --- a/vllm/model_executor/layers/attention/ops/paged_attn.py +++ b/vllm/model_executor/layers/attention/ops/paged_attn.py @@ -46,7 +46,7 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - kv_scale: float + kv_scale: float, ) -> torch.Tensor: output = torch.empty_like(query) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 0f31af51abb79..f50cd39271106 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -48,7 +48,6 @@ hf_model_weights_iterator, kv_cache_scales_loader) from vllm.sequence import SamplerOutput -from vllm.config import LoRAConfig from vllm.utils import is_hip KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -405,7 +404,11 @@ def load_kv_cache_scales(self, scales_path: str) -> None: scales_path, 
tp_rank, tp_size, self.config.num_hidden_layers, self.config.__class__.model_type): - layer_paged_attn = self.model.layers[layer_idx].self_attn.attn.backend + layer_paged_attn = ( + self.model.layers[layer_idx]. + self_attn.attn.backend + ) + if is_hip(): # The scaling factor convention we are assuming is # quantized_value * scaling_factor ~= true_value diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 7c2dec8e57082..124269a6ffe13 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -268,8 +268,9 @@ def hf_model_weights_iterator( def kv_cache_scales_loader(filename: str, tp_rank: int, tp_size: int, - num_hidden_layers, - model_type: Optional[str]) -> Iterable[Tuple[int, float]]: + num_hidden_layers: int, + model_type: Optional[str] + )-> Iterable[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. Used by the model to populate the appropriate @@ -290,44 +291,66 @@ def kv_cache_scales_loader(filename: str, schema = json.load(f, parse_int=int, parse_constant=float) malformed_schema_str = "Malformed schema detected." - # If any of the inputs are malformed or mismatched, it raises an error - # somewhere in the following lines and is caught in except + # If any of the inputs are malformed or mismatched, + # it raises an error somewhere in the following + # lines and is caught in except. assert isinstance(schema, dict), malformed_schema_str if schema["model_type"] is not None: - assert model_type == schema["model_type"], f"Model type is {model_type} but loaded " \ - f"scaling factors belonging to different model type {schema['model_type']}!" + assert model_type == schema["model_type"],( + f"Model type is {model_type} but loaded " \ + f"scaling factors belonging to different " \ + f"model type {schema['model_type']}!" 
) assert isinstance(schema["kv_cache"], dict), malformed_schema_str - assert schema["kv_cache"]["dtype"] == "float8_e4m3fn", "Loaded scaling factors intended " \ - f"for KV cache dtype = {schema['kv_cache']['dtype']} rather than float8_e4m3fn!" - assert isinstance(schema["kv_cache"]["scaling_factor"], dict), malformed_schema_str + assert schema["kv_cache"]["dtype"] == "float8_e4m3fn",( + f"Loaded scaling factors intended " \ + f"for KV cache dtype = {schema['kv_cache']['dtype']}" \ + f"rather than float8_e4m3fn!") + if not isinstance(schema["kv_cache"]["scaling_factor"], dict): + raise AssertionError(malformed_schema_str) raw_rank_scales_map = schema["kv_cache"]["scaling_factor"] # The keys in raw_rank_scales_map should be strings with the format - # f"{rank_keyword}{tp_rank}", where rank_keyword is an alphabetical string shared - # amongst all keys and tp_rank is a numeric string. Thus, recovering the alphabetical + # f"{rank_keyword}{tp_rank}", where rank_keyword is an + # alphabetical string shared amongst all keys and tp_rank + # is a numeric string. Thus, recovering the alphabetical # components of any key should return rank_keyword rank_keyword = "".join(char for char in next(iter(raw_rank_scales_map.keys())) if char.isalpha()) - rank_scales_map = {int(rank.replace(rank_keyword, "")) : scales_map - for rank, scales_map in raw_rank_scales_map.items()} - assert len(rank_scales_map) != 0, "Loaded KV scales dictionary is empty." + rank_scales_map = { + int(rank.replace(rank_keyword, "")) : scales_map + for rank, scales_map in raw_rank_scales_map.items()} + assert len(rank_scales_map) != 0, \ + "Loaded KV scales dictionary is empty." loaded_tp_size = max(rank_scales_map.keys()) + 1 - assert loaded_tp_size == tp_size, f"Loaded dictionary has TP size {loaded_tp_size} " \ - f"but LLM engine is currently running with TP size {tp_size}." 
+ assert loaded_tp_size == tp_size, ( + f"Loaded dictionary has TP size {loaded_tp_size} " \ + f"but LLM engine is currently running with TP size {tp_size}." + ) for rank, scales_map in rank_scales_map.items(): assert isinstance(scales_map, dict), malformed_schema_str - assert len(scales_map) == num_hidden_layers, "KV cache scales map for TP rank " \ - f"{rank} is malformed. Expected {num_hidden_layers} layers, got {len(scales_map)}." + assert len(scales_map) == num_hidden_layers, ( + f"KV cache scales map for TP rank {rank} is malformed." \ + f"Expected {num_hidden_layers} layers, " + f"got {len(scales_map)}." + ) for i in range(tp_size): - assert i in rank_scales_map, f"KV cache scales map for TP rank {i} not found." - assert tp_rank in rank_scales_map, "Tried to load KV cache scales for TP rank " \ - f"{tp_rank} but these were not found." - assert isinstance(rank_scales_map[tp_rank], dict), malformed_schema_str - layer_scales_map = {int(layer_idx): float(scale) - for layer_idx, scale in rank_scales_map[tp_rank].items()} + assert i in rank_scales_map, ( + f"KV cache scales map for TP rank {i} not found." + ) + assert tp_rank in rank_scales_map, ( + "Tried to load KV cache scales for TP rank " \ + f"{tp_rank} but these were not found." + ) + scales_map = rank_scales_map.get(tp_rank) + assert isinstance(scales_map, dict), malformed_schema_str + layer_scales_map = { + int(layer_idx): float(scale) + for layer_idx, scale in rank_scales_map[tp_rank].items()} for i in range(num_hidden_layers): - assert i in layer_scales_map, "Could not find KV cache scales for layer " \ - f"{i} in TP rank {tp_rank}." + assert i in layer_scales_map, ( + "Could not find KV cache scales for layer " \ + f"{i} in TP rank {tp_rank}." 
+ ) return layer_scales_map.items() except FileNotFoundError: @@ -339,8 +362,9 @@ def kv_cache_scales_loader(filename: str, # This section is reached if and only if any of the excepts are hit # Return an empty iterable (tuple) => no KV cache scales are loaded # which effectively defaults to 1.0 scales - logger.warn(f"Defaulting to KV cache scaling factors = 1.0 for all layers in TP rank {tp_rank}" - " as an error occurred during loading.") + logger.warn("Defaulting to KV cache scaling factors = 1.0 " + f"for all layers in TP rank {tp_rank} " + "as an error occurred during loading.") return () diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 3adf628394e7b..2c8c7bae3c9c1 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -7,7 +7,6 @@ from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger from vllm.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE -from vllm.utils import in_wsl, STR_DTYPE_TO_TORCH_DTYPE logger = init_logger(__name__) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e60daef1bc679..9ca32d280677f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -118,14 +118,17 @@ def load_model(self) -> None: if callable(getattr(self.model, "load_kv_cache_scales", None)): self.model.load_kv_cache_scales(self.model_config.scales_path) else: - raise RuntimeError("Using FP8 KV cache and scaling factors provided but " - f"model {self.model.__class__} does not support loading " - "scaling factors.") + raise RuntimeError("Using FP8 KV cache and scaling " + "factors provided but model " + f"{self.model.__class__} does not " + "support loading scaling factors.") else: - logger.warn(f"Using FP8 KV cache but no scaling factors provided. Defaulting to " - "scaling factors of 1.0, This may lead to less accurate results!") + logger.warn("Using FP8 KV cache but no scaling factors " + "provided. 
Defaulting to scaling factors of 1.0. " + "This may lead to less accurate results!") elif self.model_config.scales_path is not None: - logger.warn("KV cache scaling factors provided, but the KV cache data type is not FP8. " + logger.warn("KV cache scaling factors provided, " + "but the KV cache data type is not FP8. " "KV cache scaling factors will not be used.") From 94c2e7c6ccd949f015ab26e94b08e92b23df20ed Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Wed, 13 Mar 2024 22:26:07 +0000 Subject: [PATCH 134/159] Update model config for scales path --- benchmarks/benchmark_throughput.py | 4 ++-- docs/source/index.rst | 3 ++- vllm/config.py | 14 +++++++------- vllm/engine/arg_utils.py | 8 ++++---- 4 files changed, 15 insertions(+), 14 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index b46a110a9fb4a..69879cea28f4d 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -72,7 +72,7 @@ def run_vllm( max_model_len: Optional[int], enforce_eager: bool, kv_cache_dtype: str, - kv_cache_scales_path: Optional[str], + scales_path: Optional[str], device: str, enable_prefix_caching: bool, gpu_memory_utilization: float = 0.9, @@ -89,7 +89,7 @@ def run_vllm( gpu_memory_utilization=gpu_memory_utilization, enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, - scales_path=kv_cache_scales_path, + scales_path=scales_path, device=device, enable_prefix_caching=enable_prefix_caching) diff --git a/docs/source/index.rst b/docs/source/index.rst index 65bfbbabf8be1..984092c037ac2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -92,7 +92,8 @@ Documentation :caption: Quantization quantization/auto_awq - quantization/fp8_e5m2_kv_cache + quantization/fp8_e5m2_kvcache + quantization/fp8/amd_detail .. 
toctree:: :maxdepth: 2 diff --git a/vllm/config.py b/vllm/config.py index b343027b71d19..4c2743942745c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -30,11 +30,6 @@ class ModelConfig: downloading the model and tokenizer. download_dir: Directory to download and load the weights, default to the default cache directory of huggingface. - scales_path: Path to JSON file containing scaling factors. Used to load - KV cache scaling factors into the model when KV cache type - is FP8_E4M3 on ROCm (AMD GPU). In the future these will also be used - to load activation and weight scaling factors when the model dtype - is FP8_E4M3 on ROCm. load_format: The format of the model weights to load: "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is @@ -62,6 +57,11 @@ class ModelConfig: output). If None, will be derived from the model. quantization: Quantization method that was used to quantize the model weights. If None, we assume the model weights are not quantized. + scales_path: Path to JSON file containing scaling factors. Used to load + KV cache scaling factors into the model when KV cache type + is FP8_E4M3 on ROCm (AMD GPU). In the future these will also be used + to load activation and weight scaling factors when the model dtype + is FP8_E4M3 on ROCm. enforce_eager: Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid. 
@@ -77,7 +77,6 @@ def __init__( tokenizer_mode: str, trust_remote_code: bool, download_dir: Optional[str], - scales_path: Optional[str], load_format: str, dtype: Union[str, torch.dtype], seed: int, @@ -86,6 +85,7 @@ def __init__( tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, quantization: Optional[str] = None, + scales_path: Optional[str] = None, enforce_eager: bool = False, max_context_len_to_capture: Optional[int] = None, max_logprobs: int = 5, @@ -95,7 +95,6 @@ def __init__( self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code self.download_dir = download_dir - self.scales_path = scales_path self.load_format = load_format self.dtype = dtype self.seed = seed @@ -103,6 +102,7 @@ def __init__( self.code_revision = code_revision self.tokenizer_revision = tokenizer_revision self.quantization = quantization + self.scales_path = scales_path self.enforce_eager = enforce_eager self.max_context_len_to_capture = max_context_len_to_capture self.max_logprobs = max_logprobs diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 815247b01b6be..41118a0df1fe9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,7 @@ class EngineArgs: load_format: str = 'auto' dtype: str = 'auto' kv_cache_dtype: str = 'auto' - scales_path: str = None + scales_path: Optional[str] = None seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False @@ -318,11 +318,11 @@ def create_engine_configs( model_config = ModelConfig( self.model, self.tokenizer, self.tokenizer_mode, self.trust_remote_code, self.download_dir, - self.scales_path, self.load_format, - self.dtype, self.seed, self.revision, self.code_revision, + self.load_format, self.dtype, self.seed, + self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager, self.max_context_len_to_capture, - self.max_logprobs) + self.scales_path, self.max_logprobs) cache_config = 
CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, From a350641ff2fc63c61602f475632f16cc5db0ad41 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Wed, 13 Mar 2024 23:03:37 +0000 Subject: [PATCH 135/159] Add .rst for fp8_e4m3_kvcache and rename fp8_kvcache to fp8_e5m2 --- docs/source/index.rst | 2 +- docs/source/quantization/fp8_e4m3_kvcache.rst | 19 +++++++++++++++++++ ...{fp8_kv_cache.rst => fp8_e5m2_kvcache.rst} | 0 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 docs/source/quantization/fp8_e4m3_kvcache.rst rename docs/source/quantization/{fp8_kv_cache.rst => fp8_e5m2_kvcache.rst} (100%) diff --git a/docs/source/index.rst b/docs/source/index.rst index 984092c037ac2..506088b6d4034 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -93,7 +93,7 @@ Documentation quantization/auto_awq quantization/fp8_e5m2_kvcache - quantization/fp8/amd_detail + quantization/fp8_e4m3_kvcache .. toctree:: :maxdepth: 2 diff --git a/docs/source/quantization/fp8_e4m3_kvcache.rst b/docs/source/quantization/fp8_e4m3_kvcache.rst new file mode 100644 index 0000000000000..a45abf974be41 --- /dev/null +++ b/docs/source/quantization/fp8_e4m3_kvcache.rst @@ -0,0 +1,19 @@ +.. _fp8_e4m3_kvcache: + +FP8 E4M3 KV Cache +================== + +The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. +The FP8 data format retains 3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. + +Here is an example of how to enable this feature: + +.. 
code-block:: python + + + from vllm import LLM, SamplingParams + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + llm = LLM(model="/data/models/llama-2-70b-chat-hf", kv_cache_dtype="fp8", scales_path="./tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json") + prompt = "London is the capital of" + out = llm.generate(prompt, sampling_params)[0].outputs[0].text + print(out) diff --git a/docs/source/quantization/fp8_kv_cache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst similarity index 100% rename from docs/source/quantization/fp8_kv_cache.rst rename to docs/source/quantization/fp8_e5m2_kvcache.rst From eb8e3d8f25d9b547074d2f5df688de9d677fb65e Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Thu, 14 Mar 2024 20:50:19 +0000 Subject: [PATCH 136/159] Skip fp8 UT test on CUDA for e4m3 --- tests/kernels/test_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 2f7f7b834e1fe..b270d935924c1 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -257,7 +257,7 @@ def test_swap_blocks( assert torch.allclose(src_value_caches_clone[src].cpu(), dist_value_caches[0][dst].cpu()) - +@pytest.mark.skipif(not is_hip(), reason="FP8 conversion test requires e4m3") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) From f9eba0cfedfb5767b7f85f15c7406ba782e1e26c Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 14 Mar 2024 22:56:57 +0000 Subject: [PATCH 137/159] Fix device id formatting --- tests/kernels/test_attention.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 854f819d03e49..55f6a953e9e22 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -143,7 +143,6 @@ def test_paged_attention( num_query_heads, num_kv_heads = 
num_heads query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) query.uniform_(-scale, scale) - gpu_id = f"cuda:{device}" assert num_query_heads % num_kv_heads == 0 num_queries_per_kv = num_query_heads // num_kv_heads @@ -238,14 +237,14 @@ def test_paged_attention( block_size, x) dequantized_key_cache = torch.empty(size=key_cache_shape, dtype=dtype, - device=gpu_id) + device=device) cache_ops.convert_fp8(key_cache, dequantized_key_cache) key_cache = dequantized_key_cache value_cache_shape = value_cache.shape dequantized_value_cache = torch.empty(size=value_cache_shape, dtype=dtype, - device=gpu_id) + device=device) cache_ops.convert_fp8(value_cache, dequantized_value_cache) value_cache = dequantized_value_cache From db8f29c636e3ad290001f1ab1c296f1ee1ee980d Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 15 Mar 2024 00:05:54 +0000 Subject: [PATCH 138/159] Fix scales_path location --- vllm/engine/arg_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 41118a0df1fe9..b2a6227fb4054 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -321,8 +321,8 @@ def create_engine_configs( self.load_format, self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, - self.enforce_eager, self.max_context_len_to_capture, - self.scales_path, self.max_logprobs) + self.scales_path, self.enforce_eager, + self.max_context_len_to_capture, self.max_logprobs) cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, From 49d1593ebdc51cc14c82b725a085fd829d13513f Mon Sep 17 00:00:00 2001 From: Matthew Wong Date: Fri, 15 Mar 2024 00:11:06 +0000 Subject: [PATCH 139/159] Fix yapf formatting --- 3rdparty/quantizer/extract_scales.py | 190 +++++++++++------- 3rdparty/quantizer/quantize.py | 27 +-- benchmarks/benchmark_latency.py | 9 +- 
benchmarks/benchmark_throughput.py | 23 ++- .../kernels/benchmark_paged_attention.py | 6 +- setup.py | 7 +- tests/kernels/test_cache.py | 39 ++-- vllm/config.py | 3 +- vllm/engine/arg_utils.py | 7 +- .../layers/attention/backends/xformers.py | 15 +- .../layers/attention/ops/paged_attn.py | 13 +- vllm/model_executor/models/llama.py | 14 +- vllm/model_executor/weight_utils.py | 60 +++--- vllm/utils.py | 4 +- vllm/worker/model_runner.py | 10 +- 15 files changed, 235 insertions(+), 192 deletions(-) diff --git a/3rdparty/quantizer/extract_scales.py b/3rdparty/quantizer/extract_scales.py index 30b3354a9b76c..c19396796ffcc 100644 --- a/3rdparty/quantizer/extract_scales.py +++ b/3rdparty/quantizer/extract_scales.py @@ -20,8 +20,9 @@ def _prepare_hf_weights( fall_back_to_pt: bool = True, ) -> Tuple[str, List[str], bool]: if not os.path.isdir(quantized_model_dir): - raise FileNotFoundError(f"The quantized model directory `{quantized_model_dir}` " - "does not exist.") + raise FileNotFoundError( + f"The quantized model directory `{quantized_model_dir}` " + "does not exist.") use_safetensors = False # Some quantized models use .pt files for storing the weights. 
if load_format == "auto": @@ -40,7 +41,8 @@ def _prepare_hf_weights( hf_weights_files: List[str] = [] for pattern in allow_patterns: - hf_weights_files += glob.glob(os.path.join(quantized_model_dir, pattern)) + hf_weights_files += glob.glob( + os.path.join(quantized_model_dir, pattern)) if len(hf_weights_files) > 0: if pattern == "*.safetensors": use_safetensors = True @@ -69,7 +71,7 @@ def _prepare_hf_weights( # Adapted from vllm/model_executor/weight_utils.py -def _hf_tensorfile_iterator(filename: str, load_format: str, +def _hf_tensorfile_iterator(filename: str, load_format: str, use_safetensors: bool): if load_format == "npz": assert not use_safetensors @@ -90,10 +92,11 @@ def _hf_tensorfile_iterator(filename: str, load_format: str, torch.cuda.empty_cache() -def _kv_scales_extractor(hf_tensor_files: Iterable[str], - use_safetensors: bool, - rank_keyword: str = "rank", - expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: +def _kv_scales_extractor( + hf_tensor_files: Iterable[str], + use_safetensors: bool, + rank_keyword: str = "rank", + expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: """ Given a list of files containing tensor data, attempt to extract KV cache scales from these files. Intended as a helper function taking in the output from _prepare_hf_weights. @@ -107,7 +110,8 @@ def _kv_scales_extractor(hf_tensor_files: Iterable[str], respective per-layer scaling factor. """ for char in rank_keyword: - assert not char.isdecimal(), f"Rank keyword {rank_keyword} contains a numeric character!" + assert not char.isdecimal( + ), f"Rank keyword {rank_keyword} contains a numeric character!" 
rank_scales_map = {} for tensor_file in hf_tensor_files: try: @@ -115,7 +119,8 @@ def _kv_scales_extractor(hf_tensor_files: Iterable[str], if rank_idx != -1: start_idx = rank_idx + len(rank_keyword) stop_idx = start_idx - while stop_idx < len(tensor_file) and tensor_file[stop_idx].isdecimal(): + while stop_idx < len( + tensor_file) and tensor_file[stop_idx].isdecimal(): stop_idx += 1 if stop_idx == start_idx: raise RuntimeError("Did not find rank # in filename.") @@ -125,37 +130,47 @@ def _kv_scales_extractor(hf_tensor_files: Iterable[str], # that it's intended for TP rank 0 rank = 0 else: - raise RuntimeError(f"Filename does not contain '{rank_keyword}'.") + raise RuntimeError( + f"Filename does not contain '{rank_keyword}'.") except RuntimeError: print("Unable to determine TP rank " f"corresponding to file '{tensor_file}'") raise - + if rank not in rank_scales_map: layer_scales_map = {} rank_scales_map[rank] = layer_scales_map else: - raise RuntimeError(f"Tensor file '{tensor_file}' shares TP rank {rank} " - "with another tensor file.") - + raise RuntimeError( + f"Tensor file '{tensor_file}' shares TP rank {rank} " + "with another tensor file.") + module_delimiter = ":" if args.load_format == "npz" else "." 
- for name, param in _hf_tensorfile_iterator(tensor_file, args.load_format, + for name, param in _hf_tensorfile_iterator(tensor_file, + args.load_format, use_safetensors): if "kv_cache_scaling_factor" in name: - nums = [int(s) for s in name.split(module_delimiter) if s.isdecimal()] - assert len(nums) == 1, f"Could not determine layer idx for {name}" + nums = [ + int(s) for s in name.split(module_delimiter) + if s.isdecimal() + ] + assert len( + nums) == 1, f"Could not determine layer idx for {name}" layer_idx = nums[0] assert layer_idx not in layer_scales_map, f"Duplicate scaling " \ f"factor corresponding to layer {layer_idx}" try: layer_scales_map[layer_idx] = param.item() except RuntimeError: - print("This utility supports only per-tensor scalar scale factors " - f"for now. The tensor\n {name} = {param} \nis an invalid " - "scale factor.") + print( + "This utility supports only per-tensor scalar scale factors " + f"for now. The tensor\n {name} = {param} \nis an invalid " + "scale factor.") raise - if all(len(layer_scales_map) == 0 for layer_scales_map in rank_scales_map.values()): + if all( + len(layer_scales_map) == 0 + for layer_scales_map in rank_scales_map.values()): # Note: this is true even if the rank_scales_map is empty print("WARNING: No KV cache scale factors found. No output saved.") return None @@ -168,7 +183,9 @@ def _kv_scales_extractor(hf_tensor_files: Iterable[str], assert i in rank_scales_map, f"Expected TP world size = {empirical_tp_world_size} " \ "but did not find KV cache scaling factors " \ f"for TP rank {i}" - print(f"Found TP world size = {empirical_tp_world_size} when extracting KV cache scales!") + print( + f"Found TP world size = {empirical_tp_world_size} when extracting KV cache scales!" + ) return rank_scales_map @@ -193,21 +210,25 @@ def _metadata_extractor(quantized_model_dir: str, fail to be extracted, their corresponding values are set to None, and a warning is printed. 
""" if not os.path.isdir(quantized_model_dir): - raise FileNotFoundError(f"The quantized model directory `{quantized_model_dir}` " - "does not exist.") + raise FileNotFoundError( + f"The quantized model directory `{quantized_model_dir}` " + "does not exist.") metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) - + result = {} for file in metadata_files: with open(file) as f: try: metadata = json.load(f) except json.JSONDecodeError: - print(f"Could not parse `{file}` as a valid metadata file, skipping it.") + print( + f"Could not parse `{file}` as a valid metadata file, skipping it." + ) continue if not isinstance(metadata, dict): - print(f"The file `{file}` does not correspond to a JSON-serialized " - "dictionary, skipping it.") + print( + f"The file `{file}` does not correspond to a JSON-serialized " + "dictionary, skipping it.") continue for metadata_name, extract_fn in metadata_extract_fns.items(): try: @@ -215,10 +236,11 @@ def _metadata_extractor(quantized_model_dir: str, if metadata_name not in result: result[metadata_name] = metadata_info elif metadata_info != result[metadata_name]: - raise RuntimeError("Metadata mismatch! Originally found " - f"{metadata_name} = {result[metadata_name]} but " - f"now found {metadata_name} = {metadata_info} in " - f"`{file}`") + raise RuntimeError( + "Metadata mismatch! Originally found " + f"{metadata_name} = {result[metadata_name]} but " + f"now found {metadata_name} = {metadata_info} in " + f"`{file}`") except KeyError: # It is possible that a given file does not contain some of our selected # metadata as it could be located in some other metadata file. 
@@ -231,8 +253,9 @@ def _metadata_extractor(quantized_model_dir: str, # Warn if we cannot find any of the requested metadata for metadata_name in metadata_extract_fns: if metadata_name not in result: - print(f"WARNING: Unable to find requested metadata field `{metadata_name}`, " - "setting it to None.") + print( + f"WARNING: Unable to find requested metadata field `{metadata_name}`, " + "setting it to None.") result[metadata_name] = None return result @@ -244,7 +267,8 @@ def main(args): "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]), "model_dtype": lambda json_dict: json_dict["dtype"] } - recovered_metadata = _metadata_extractor(args.quantized_model, metadata_extract_fns) + recovered_metadata = _metadata_extractor(args.quantized_model, + metadata_extract_fns) if args.tp_size is not None: metadata_tp_size = recovered_metadata["tp_size"] if metadata_tp_size is not None: @@ -252,14 +276,17 @@ def main(args): f"{args.tp_size} but found TP world size = {metadata_tp_size} from metadata!" expected_tp_size = args.tp_size or recovered_metadata["tp_size"] rank_keyword = "rank" - hf_tensor_files, use_safetensors = _prepare_hf_weights(args.quantized_model, args.load_format) + hf_tensor_files, use_safetensors = _prepare_hf_weights( + args.quantized_model, args.load_format) rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, rank_keyword, expected_tp_size) # Postprocess: formatting to the current schema. Consider pulling it out into a dedicated # function should it ever become more complicated. 
- rank_scales_map = { rank_keyword + str(rank) : - {k: scale[k] for k in sorted(scale.keys())} - for rank, scale in rank_scales_map.items() } + rank_scales_map = { + rank_keyword + str(rank): {k: scale[k] + for k in sorted(scale.keys())} + for rank, scale in rank_scales_map.items() + } # Consider generalizing and formalizing this into its own class (and other necessary # subclasses) in the future @@ -279,50 +306,57 @@ def main(args): if not os.path.isdir(args.output_dir): os.makedirs(args.output_dir, exist_ok=True) output_file = os.path.join(args.output_dir, args.output_name) - + with open(output_file, 'w') as f: json.dump(schema, f, indent=4) print(f"Completed! KV cache scaling factors saved to {output_file}") if __name__ == "__main__": - parser = argparse.ArgumentParser(description="This simple utility extracts the " - "KV cache scaling factors from a quantized HF model " - "and saves them to a JSON file compatible with later " - "use by vLLM (pass this file to the appropriate " - "runtime typically using the argument " - "--scales-path ). This is only used " - "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") - parser.add_argument("--quantized_model", - help="Specify the directory containing a single quantized HF model. " - "It is expected that the quantization format is FP8_E4M3, for use on ROCm " - "(AMD GPU).", - required=True) - parser.add_argument("--load_format", - help="Optionally specify the format of the model's tensor files " - "containing the KV cache scaling factors.", - choices=["auto", "safetensors", "npz", "pt"], - default="auto") - parser.add_argument("--output_dir", - help="Optionally specify the output directory. 
By default the " - "KV cache scaling factors will be saved in the model directory, " - "however you can override this behavior here.", - default=None) - parser.add_argument("--output_name", - help="Optionally specify the output filename.", - # TODO: Change this once additional scaling factors are enabled - default="kv_cache_scales.json") - parser.add_argument("--tp_size", - help="Optionally specify the tensor-parallel (TP) size that the " - "quantized model should correspond to. If specified, during KV " - "cache scaling factor extraction the observed TP size will be " - "checked against this and an error will be raised if there is " - "a mismatch. If not specified, the quantized model's expected " - "TP size is instead inferred from the largest TP rank observed. " - "The expected TP size is cross-checked against the TP ranks " - "observed in the quantized model and an error is raised if any " - "discrepancies are found.", - default=None, type=int) + parser = argparse.ArgumentParser( + description="This simple utility extracts the " + "KV cache scaling factors from a quantized HF model " + "and saves them to a JSON file compatible with later " + "use by vLLM (pass this file to the appropriate " + "runtime typically using the argument " + "--scales-path ). This is only used " + "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") + parser.add_argument( + "--quantized_model", + help="Specify the directory containing a single quantized HF model. " + "It is expected that the quantization format is FP8_E4M3, for use on ROCm " + "(AMD GPU).", + required=True) + parser.add_argument( + "--load_format", + help="Optionally specify the format of the model's tensor files " + "containing the KV cache scaling factors.", + choices=["auto", "safetensors", "npz", "pt"], + default="auto") + parser.add_argument( + "--output_dir", + help="Optionally specify the output directory. 
By default the " + "KV cache scaling factors will be saved in the model directory, " + "however you can override this behavior here.", + default=None) + parser.add_argument( + "--output_name", + help="Optionally specify the output filename.", + # TODO: Change this once additional scaling factors are enabled + default="kv_cache_scales.json") + parser.add_argument( + "--tp_size", + help="Optionally specify the tensor-parallel (TP) size that the " + "quantized model should correspond to. If specified, during KV " + "cache scaling factor extraction the observed TP size will be " + "checked against this and an error will be raised if there is " + "a mismatch. If not specified, the quantized model's expected " + "TP size is instead inferred from the largest TP rank observed. " + "The expected TP size is cross-checked against the TP ranks " + "observed in the quantized model and an error is raised if any " + "discrepancies are found.", + default=None, + type=int) args = parser.parse_args() main(args) diff --git a/3rdparty/quantizer/quantize.py b/3rdparty/quantizer/quantize.py index dfc21e7fc3918..1ff567061ae39 100644 --- a/3rdparty/quantizer/quantize.py +++ b/3rdparty/quantizer/quantize.py @@ -264,7 +264,8 @@ def main(args): if args.qformat in QUANT_CFG_CHOICES: quant_cfg = QUANT_CFG_CHOICES[args.qformat] else: - raise ValueError(f"Unsupported quantization format: {args.qformat}") + raise ValueError( + f"Unsupported quantization format: {args.qformat}") if "awq" in args.qformat: quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat]) @@ -302,15 +303,16 @@ def main(args): ]) # export safetensors - export_model_config(model, - model_type, - getattr(torch, args.dtype), - export_dir=export_path, - inference_tensor_parallel=args.tp_size, - inference_pipeline_parallel=args.pp_size, - # export_tensorrt_llm_config=(not export_npz), - export_tensorrt_llm_config=False, - export_npz=export_npz) + export_model_config( + model, + model_type, + getattr(torch, args.dtype), + 
export_dir=export_path, + inference_tensor_parallel=args.tp_size, + inference_pipeline_parallel=args.pp_size, + # export_tensorrt_llm_config=(not export_npz), + export_tensorrt_llm_config=False, + export_npz=export_npz) # Workaround for wo quantization if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: @@ -326,9 +328,8 @@ def main(args): json.dump(tensorrt_llm_config, f, indent=4) end_time = time.time() - print( - "Quantized model exported to {} \nTotal time used {:.2f} s.".format( - export_path, end_time - start_time)) + print("Quantized model exported to {} \nTotal time used {:.2f} s.". + format(export_path, end_time - start_time)) if __name__ == "__main__": diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index cf6acc1847491..0eabd1f66ffc5 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -129,9 +129,11 @@ def run_to_completion(profile_dir: Optional[str] = None): type=str, choices=['auto', 'fp8'], default='auto', - help='Data type for kv cache storage. If "auto", will use model data type. ' + help= + 'Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' - 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.' + ) parser.add_argument( '--scales-path', type=str, @@ -140,7 +142,8 @@ def run_to_completion(profile_dir: Optional[str] = None): 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' - 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.' 
+ ) parser.add_argument( '--profile', action='store_true', diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 69879cea28f4d..cb940c92ec982 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -210,12 +210,14 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm( - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype,args.scales_path,args.device, - args.enable_prefix_caching, args.gpu_memory_utilization) + elapsed_time = run_vllm(requests, args.model, args.tokenizer, + args.quantization, args.tensor_parallel_size, + args.seed, args.n, args.use_beam_search, + args.trust_remote_code, args.dtype, + args.max_model_len, args.enforce_eager, + args.kv_cache_dtype, args.scales_path, + args.device, args.enable_prefix_caching, + args.gpu_memory_utilization) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -304,9 +306,11 @@ def main(args: argparse.Namespace): type=str, choices=["auto", "fp8"], default="auto", - help='Data type for kv cache storage. If "auto", will use model data type. ' + help= + 'Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' - 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.' + ) parser.add_argument( '--scales-path', type=str, @@ -315,7 +319,8 @@ def main(args: argparse.Namespace): 'This should generally be supplied, when KV cache dtype is FP8. Otherwise, ' 'KV cache scaling factors default to 1.0, which may cause accuracy issues. 
' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' - 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.' + ) parser.add_argument( "--device", type=str, diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index c1d6dacc6a0d4..3ba7c7b908f8f 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -186,9 +186,11 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: type=str, choices=["auto", "fp8"], default="auto", - help='Data type for kv cache storage. If "auto", will use model data type. ' + help= + 'Data type for kv cache storage. If "auto", will use model data type. ' 'FP8_E5M2 (without scaling) is only supported on cuda version greater than 11.8. ' - 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.') + 'On ROCm (AMD GPU), FP8_E4M3 is instead supported for common inference criteria.' + ) args = parser.parse_args() print(args) diff --git a/setup.py b/setup.py index e7ed200a275ef..bf542dcd47b89 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,10 @@ # Supported NVIDIA GPU architectures. 
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx940", "gfx941", "gfx942", "gfx1030", "gfx1100"} +ROCM_SUPPORTED_ARCHS = { + "gfx90a", "gfx908", "gfx906", "gfx940", "gfx941", "gfx942", "gfx1030", + "gfx1100" +} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) @@ -67,6 +70,7 @@ def _is_cuda() -> bool: CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + def get_amdgpu_offload_arch(): command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" try: @@ -82,6 +86,7 @@ def get_amdgpu_offload_arch(): return None + def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index b270d935924c1..36af1614546e7 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -135,9 +135,9 @@ def test_reshape_and_cache( # Create the KV caches. key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, - kv_cache_dtype, dtype, - seed, gpu_id) + num_heads, head_size, + kv_cache_dtype, dtype, seed, + gpu_id) key_cache, value_cache = key_caches[0], value_caches[0] # Clone the KV caches. @@ -156,7 +156,7 @@ def test_reshape_and_cache( # Call the reshape_and_cache kernel. 
cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, kv_scale) - + if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) cache_ops.convert_fp8(key_cache, result_key_cache) @@ -174,12 +174,16 @@ def test_reshape_and_cache( block_offset = block_offsets[i] cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] - + if kv_cache_dtype == "fp8": - assert torch.allclose(result_key_cache, cloned_key_cache, - atol=0.001, rtol=0.1) - assert torch.allclose(result_value_cache, cloned_value_cache, - atol=0.001, rtol=0.1) + assert torch.allclose(result_key_cache, + cloned_key_cache, + atol=0.001, + rtol=0.1) + assert torch.allclose(result_value_cache, + cloned_value_cache, + atol=0.001, + rtol=0.1) else: assert torch.allclose(key_cache, cloned_key_cache) assert torch.allclose(value_cache, cloned_value_cache) @@ -230,18 +234,14 @@ def test_swap_blocks( block_mapping = dict(zip(src_blocks, dst_blocks)) # Create the KV caches on the first device. - src_key_caches, src_value_caches = kv_cache_factory(num_blocks, - block_size, 1, - num_heads, head_size, - kv_cache_dtype, dtype, - seed, src_device) + src_key_caches, src_value_caches = kv_cache_factory( + num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, + seed, src_device) # Create the KV caches on the second device. 
- dist_key_caches, dist_value_caches = kv_cache_factory(num_blocks, - block_size, 1, - num_heads, head_size, - kv_cache_dtype, dtype, - seed, dst_device) + dist_key_caches, dist_value_caches = kv_cache_factory( + num_blocks, block_size, 1, num_heads, head_size, kv_cache_dtype, dtype, + seed, dst_device) src_key_caches_clone = src_key_caches[0].clone() src_value_caches_clone = src_value_caches[0].clone() @@ -257,6 +257,7 @@ def test_swap_blocks( assert torch.allclose(src_value_caches_clone[src].cpu(), dist_value_caches[0][dst].cpu()) + @pytest.mark.skipif(not is_hip(), reason="FP8 conversion test requires e4m3") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/vllm/config.py b/vllm/config.py index 4c2743942745c..8b492078363f7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -356,8 +356,7 @@ def _verify_cache_dtype(self) -> None: if nvcc_cuda_version < Version("11.8"): raise ValueError( "FP8 is not supported when cuda version is" - "lower than 11.8." - ) + "lower than 11.8.") logger.info( "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b2a6227fb4054..0979d19ce3098 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -142,7 +142,7 @@ def add_cli_args( choices=['auto', 'fp8'], default='auto', help='Data type for kv cache storage. If "auto", will use model ' - 'data type. FP8_E5M2 (without scaling) is only supported on cuda ' + 'data type. FP8_E5M2 (without scaling) is only supported on cuda ' 'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead ' 'supported for common inference criteria. 
') parser.add_argument( @@ -317,9 +317,8 @@ def create_engine_configs( device_config = DeviceConfig(self.device) model_config = ModelConfig( self.model, self.tokenizer, self.tokenizer_mode, - self.trust_remote_code, self.download_dir, - self.load_format, self.dtype, self.seed, - self.revision, self.code_revision, + self.trust_remote_code, self.download_dir, self.load_format, + self.dtype, self.seed, self.revision, self.code_revision, self.tokenizer_revision, self.max_model_len, self.quantization, self.scales_path, self.enforce_eager, self.max_context_len_to_capture, self.max_logprobs) diff --git a/vllm/model_executor/layers/attention/backends/xformers.py b/vllm/model_executor/layers/attention/backends/xformers.py index ffe7e36569f83..bfa496ddf7875 100644 --- a/vllm/model_executor/layers/attention/backends/xformers.py +++ b/vllm/model_executor/layers/attention/backends/xformers.py @@ -33,7 +33,7 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.register_buffer("alibi_slopes", alibi_slopes, persistent=False) - + # This will be set to a float by model initialization per attention, # if and only if we are using it. N.B. currently we only support per # tensor scalar scaling factors & only applicable to ROCm (AMD GPU). @@ -88,9 +88,14 @@ def forward( # vectors will not be cached. This happens during the initial memory # profiling run. if key_cache is not None and value_cache is not None: - PagedAttentionImpl.reshape_and_cache(key, value, key_cache, - value_cache, input_metadata, - self.kv_cache_scaling_factor,) + PagedAttentionImpl.reshape_and_cache( + key, + value, + key_cache, + value_cache, + input_metadata, + self.kv_cache_scaling_factor, + ) if input_metadata.is_prompt: # Prompt run. 
if (key_cache is None or value_cache is None @@ -171,7 +176,7 @@ def forward( else: # prefix-enabled attention - # TODO(Hai) this triton kernel has regression issue with + # TODO(Hai) this triton kernel has regression issue with # FP8 KVCache to handle mixed types output = PagedAttentionImpl.forward_prefix( query, diff --git a/vllm/model_executor/layers/attention/ops/paged_attn.py b/vllm/model_executor/layers/attention/ops/paged_attn.py index 131b900fe5108..1efbf1177bb26 100644 --- a/vllm/model_executor/layers/attention/ops/paged_attn.py +++ b/vllm/model_executor/layers/attention/ops/paged_attn.py @@ -27,15 +27,10 @@ def reshape_and_cache( input_metadata: InputMetadata, kv_cache_scaling_factor: float, ) -> None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - input_metadata.slot_mapping.flatten(), - input_metadata.kv_cache_dtype, - kv_cache_scaling_factor - ) + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + input_metadata.slot_mapping.flatten(), + input_metadata.kv_cache_dtype, + kv_cache_scaling_factor) @staticmethod def forward_decode( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f50cd39271106..80770d5dfaafb 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -41,8 +41,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator, @@ -392,7 +391,7 @@ def load_weights(self, weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - + # 
Should not be called unless the KV cache dtype is FP8 on ROCm (AMD GPU) # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should @@ -401,13 +400,10 @@ def load_kv_cache_scales(self, scales_path: str) -> None: tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for layer_idx, scaling_factor in kv_cache_scales_loader( - scales_path, tp_rank, tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type): + scales_path, tp_rank, tp_size, self.config.num_hidden_layers, + self.config.__class__.model_type): layer_paged_attn = ( - self.model.layers[layer_idx]. - self_attn.attn.backend - ) + self.model.layers[layer_idx].self_attn.attn.backend) if is_hip(): # The scaling factor convention we are assuming is diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/weight_utils.py index 124269a6ffe13..9ab361d80bfeb 100644 --- a/vllm/model_executor/weight_utils.py +++ b/vllm/model_executor/weight_utils.py @@ -265,12 +265,9 @@ def hf_model_weights_iterator( torch.cuda.empty_cache() -def kv_cache_scales_loader(filename: str, - tp_rank: int, - tp_size: int, - num_hidden_layers: int, - model_type: Optional[str] - )-> Iterable[Tuple[int, float]]: +def kv_cache_scales_loader( + filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int, + model_type: Optional[str]) -> Iterable[Tuple[int, float]]: """ A simple utility to read in KV cache scaling factors that have been previously serialized to disk. 
Used by the model to populate the appropriate @@ -282,21 +279,21 @@ def kv_cache_scales_loader(filename: str, """ try: with open(filename) as f: - # Loading and processing the entire dictionary at once allows us + # Loading and processing the entire dictionary at once allows us # to do sanity checks all at once and avoid a situation where we - # have to abort after having partially loaded scaling factors + # have to abort after having partially loaded scaling factors # Since the number of layers is small and (for now) we use scalar # scaling factors (so the size they use is also small), this is # not a concern at present. schema = json.load(f, parse_int=int, parse_constant=float) - + malformed_schema_str = "Malformed schema detected." - # If any of the inputs are malformed or mismatched, - # it raises an error somewhere in the following + # If any of the inputs are malformed or mismatched, + # it raises an error somewhere in the following # lines and is caught in except. assert isinstance(schema, dict), malformed_schema_str if schema["model_type"] is not None: - assert model_type == schema["model_type"],( + assert model_type == schema["model_type"],( f"Model type is {model_type} but loaded " \ f"scaling factors belonging to different " \ f"model type {schema['model_type']}!" 
) @@ -304,55 +301,56 @@ def kv_cache_scales_loader(filename: str, assert schema["kv_cache"]["dtype"] == "float8_e4m3fn",( f"Loaded scaling factors intended " \ f"for KV cache dtype = {schema['kv_cache']['dtype']}" \ - f"rather than float8_e4m3fn!") + f"rather than float8_e4m3fn!") if not isinstance(schema["kv_cache"]["scaling_factor"], dict): - raise AssertionError(malformed_schema_str) + raise AssertionError(malformed_schema_str) raw_rank_scales_map = schema["kv_cache"]["scaling_factor"] # The keys in raw_rank_scales_map should be strings with the format - # f"{rank_keyword}{tp_rank}", where rank_keyword is an - # alphabetical string shared amongst all keys and tp_rank + # f"{rank_keyword}{tp_rank}", where rank_keyword is an + # alphabetical string shared amongst all keys and tp_rank # is a numeric string. Thus, recovering the alphabetical # components of any key should return rank_keyword - rank_keyword = "".join(char for char in - next(iter(raw_rank_scales_map.keys())) - if char.isalpha()) + rank_keyword = "".join( + char for char in next(iter(raw_rank_scales_map.keys())) + if char.isalpha()) rank_scales_map = { - int(rank.replace(rank_keyword, "")) : scales_map - for rank, scales_map in raw_rank_scales_map.items()} + int(rank.replace(rank_keyword, "")): scales_map + for rank, scales_map in raw_rank_scales_map.items() + } assert len(rank_scales_map) != 0, \ "Loaded KV scales dictionary is empty." loaded_tp_size = max(rank_scales_map.keys()) + 1 - assert loaded_tp_size == tp_size, ( + assert loaded_tp_size == tp_size, ( f"Loaded dictionary has TP size {loaded_tp_size} " \ f"but LLM engine is currently running with TP size {tp_size}." ) for rank, scales_map in rank_scales_map.items(): assert isinstance(scales_map, dict), malformed_schema_str - assert len(scales_map) == num_hidden_layers, ( + assert len(scales_map) == num_hidden_layers, ( f"KV cache scales map for TP rank {rank} is malformed." 
\ - f"Expected {num_hidden_layers} layers, " + f"Expected {num_hidden_layers} layers, " f"got {len(scales_map)}." ) for i in range(tp_size): assert i in rank_scales_map, ( - f"KV cache scales map for TP rank {i} not found." - ) - assert tp_rank in rank_scales_map, ( + f"KV cache scales map for TP rank {i} not found.") + assert tp_rank in rank_scales_map, ( "Tried to load KV cache scales for TP rank " \ f"{tp_rank} but these were not found." ) scales_map = rank_scales_map.get(tp_rank) assert isinstance(scales_map, dict), malformed_schema_str layer_scales_map = { - int(layer_idx): float(scale) - for layer_idx, scale in rank_scales_map[tp_rank].items()} + int(layer_idx): float(scale) + for layer_idx, scale in rank_scales_map[tp_rank].items() + } for i in range(num_hidden_layers): assert i in layer_scales_map, ( "Could not find KV cache scales for layer " \ f"{i} in TP rank {tp_rank}." ) return layer_scales_map.items() - + except FileNotFoundError: logger.error(f"File or directory '{filename}' not found.") except json.JSONDecodeError: @@ -362,7 +360,7 @@ def kv_cache_scales_loader(filename: str, # This section is reached if and only if any of the excepts are hit # Return an empty iterable (tuple) => no KV cache scales are loaded # which effectively defaults to 1.0 scales - logger.warn("Defaulting to KV cache scaling factors = 1.0 " + logger.warn("Defaulting to KV cache scaling factors = 1.0 " f"for all layers in TP rank {tp_rank} " "as an error occurred during loading.") return () diff --git a/vllm/utils.py b/vllm/utils.py index c0c3f58e86ad7..d3ae9252160a9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -291,8 +291,8 @@ def create_kv_caches_with_random( elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) else: - raise ValueError( - f"Does not support key cache of type {cache_dtype}") + raise ValueError( + f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) value_cache_shape = (num_blocks, num_heads, head_size, 
block_size) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9ca32d280677f..0c5cf629c857f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -112,11 +112,12 @@ def load_model(self) -> None: self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) - + if self.kv_cache_dtype == "fp8": if self.model_config.scales_path is not None: if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales(self.model_config.scales_path) + self.model.load_kv_cache_scales( + self.model_config.scales_path) else: raise RuntimeError("Using FP8 KV cache and scaling " "factors provided but model " @@ -127,11 +128,10 @@ def load_model(self) -> None: "provided. Defaulting to scaling factors of 1.0. " "This may lead to less accurate results!") elif self.model_config.scales_path is not None: - logger.warn("KV cache scaling factors provided, " - "but the KV cache data type is not FP8. " + logger.warn("KV cache scaling factors provided, " + "but the KV cache data type is not FP8. 
" "KV cache scaling factors will not be used.") - def set_block_size(self, block_size: int) -> None: self.block_size = block_size From e56913312f3e43ba0110c5caa391d4fb35fa2069 Mon Sep 17 00:00:00 2001 From: jpvillam Date: Fri, 15 Mar 2024 13:15:54 -0400 Subject: [PATCH 140/159] Adding new rocm triton flash attention kernel Co-authored-by: Vinayak Gokhale --- Dockerfile.rocm | 14 + .../layers/attention/attention.py | 2 +- .../layers/attention/backends/flash_attn.py | 40 +- .../attention/ops/flash_attention_triton.py | 541 ++++++++++++++++++ 4 files changed, 586 insertions(+), 11 deletions(-) create mode 100644 vllm/model_executor/layers/attention/ops/flash_attention_triton.py diff --git a/Dockerfile.rocm b/Dockerfile.rocm index a45265d79a6ac..a7640f6841ad9 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -26,6 +26,9 @@ ARG BUILD_FA="1" # whether to build cupy on rocm ARG BUILD_CUPY="1" +# whether to build triton on rocm +ARG BUILD_TRITON="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -95,6 +98,17 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \ && cd ..; \ fi +# build triton +RUN if [ "$BUILD_TRITON" = "1"]; then \ + mkdir -p libs \ + && cd libs \ + && pip uninstall -y triton \ + && git clone https://github.com/ROCmSoftwarePlatform/triton.git + && cd triton/python \ + && pip3 install -e . 
\ + && cd ../..; \ + fi + COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 830e82e10f7ad..39b66fb1fb2db 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -30,7 +30,7 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if (not is_hip() and torch.cuda.get_device_capability()[0] >= 8 and + if (torch.cuda.get_device_capability()[0] >= 8 and torch.get_default_dtype() in (torch.float16, torch.bfloat16)): # Ampere or later NVIDIA GPUs. # NOTE(woosuk): FlashAttention does not support FP32. diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 512f4e49c7eb2..c7543e2d54d12 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -2,12 +2,21 @@ from typing import List, Optional # NOTE(woosuk): This imports flash_attn under vllm/thirdparty_files/. 
-from flash_attn import flash_attn_func +from vllm.utils import is_hip +try: + from flash_attn import flash_attn_func +except ImportError: + if is_hip(): + pass + else: + raise + import torch from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention.ops.paged_attn import ( PagedAttentionImpl) +from vllm.model_executor.layers.attention.ops.flash_attention_triton import attention class FlashAttentionBackend: @@ -86,15 +95,26 @@ def forward( query = query.unflatten(0, (batch_size, seq_len)) key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) - output = flash_attn_func( - query, - key, - value, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - alibi_slopes=self.alibi_slopes, - ) + if is_hip(): + output, _ = attention( + query, + key, + value, + None, + input_metadata, + True, + self.scale, + ) + else: + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) else: # prefix-enabled attention output = PagedAttentionImpl.forward_prefix( diff --git a/vllm/model_executor/layers/attention/ops/flash_attention_triton.py b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py new file mode 100644 index 0000000000000..37c15e0e6fa36 --- /dev/null +++ b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py @@ -0,0 +1,541 @@ +#!/usr/bin/env python +""" +Fused Attention +=============== + +This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao (https://tridao.me/publications/flash2/flash2.pdf) +Credits: OpenAI kernel team, AMD ML Frameworks Triton team + +Features supported: + +1) Fwd with causal masking +2) Any sequence lengths without padding (currently fwd kernel only) +3) Support for different sequence lengths for q and k +4) Nested tensor API currently does not support dropout or bias. 
+ +Not currently supported: + +1) Non power of two head dims + +""" + +import torch +import triton +import triton.language as tl + +torch_dtype:tl.constexpr = torch.float16 + +TORCH_HAS_FP8E5 = hasattr(torch, 'float8_e5m2fnuz') +if TORCH_HAS_FP8E5: + torch_dtype:tl.constexpr = torch.float8_e5m2fnuz + +@triton.jit +def cdiv_fn(x,y): + return (x + y - 1) // y + +@triton.jit +def max_fn(x, y): + return tl.math.max(x, y) + +@triton.jit +def dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride): + ms = tl.arange(0, m) + ns = tl.arange(0, n) + return philox_offset + ms[:, None] * stride + ns[None, :] + +@triton.jit +def dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_offsets = dropout_offsets(philox_seed, philox_offset, dropout_p, m, n, stride).to(tl.uint32) + # TODO: use tl.randint for better performance + return tl.rand(philox_seed, rng_offsets) + +@triton.jit +def dropout_mask(philox_seed, philox_offset, dropout_p, m, n, stride): + rng_output = dropout_rng(philox_seed, philox_offset, dropout_p, m, n, stride) + rng_keep = rng_output > dropout_p + return rng_keep + +@triton.jit +def load_fn(block_ptr, first, second, pad): + if first and second: + tensor = tl.load(block_ptr, boundary_check=(0,1), padding_option=pad) + elif first: + tensor = tl.load(block_ptr, boundary_check=(0,), padding_option=pad) + elif second: + tensor = tl.load(block_ptr, boundary_check=(1,), padding_option=pad) + else: + tensor = tl.load(block_ptr) + return tensor + +@triton.jit +def _attn_fwd_inner( + acc, l_i, m_i, q, + K_block_ptr, V_block_ptr, + start_m, + actual_seqlen_k, + dropout_p, + philox_seed, + batch_philox_offset, + encoded_softmax_block_ptr, + block_min, block_max, + offs_n_causal, + masked_blocks, + n_extra_tokens, + bias_ptr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + OFFS_M: tl.constexpr, + OFFS_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + MASK_STEPS: tl.constexpr, + 
ENABLE_DROPOUT: tl.constexpr, + RETURN_ENCODED_SOFTMAX: tl.constexpr, + PADDED_HEAD: tl.constexpr +): + # loop over k, v, and update accumulator + for start_n in range (block_min, block_max, BLOCK_N): + # For padded blocks, we will overrun the tensor size if + # we load all BLOCK_N. For others, the blocks are all within range. + k = load_fn(K_block_ptr, PADDED_HEAD, MASK_STEPS and (n_extra_tokens != 0), "zero") + if PRE_LOAD_V: + v = load_fn(V_block_ptr, MASK_STEPS and (n_extra_tokens != 0), PADDED_HEAD, "zero") + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + # We start from end of seqlen_k so only the first iteration would need + # to be checked for padding if it is not a multiple of block_n + # TODO: This can be optimized to only be true for the padded block. + if MASK_STEPS: + # If this is the last block / iteration, we want to + # mask if the sequence length is not a multiple of block size + # a solution is to always do BLOCK_M // BLOCK_N + 1 steps if not is_modulo_mn. + # last step might get wasted but that is okay. check if this masking works For + # that case. + if (start_n + BLOCK_N == block_max) and (n_extra_tokens != 0): + boundary_m = tl.full([BLOCK_M], actual_seqlen_k, dtype=tl.int32) + size_n = start_n + OFFS_N[None,:] + mask = size_n < boundary_m[:,None] + qk = tl.where(mask, qk, float("-inf")) + if IS_CAUSAL: + causal_boundary = start_n + offs_n_causal + causal_mask = OFFS_M[:, None] >= causal_boundary[None, :] + qk = tl.where(causal_mask, qk, float("-inf")) + # -- compute qk ---- + qk += tl.dot(q, k) + if bias_ptr is not None: + bias = load_fn(bias_ptr, False, MASK_STEPS and (n_extra_tokens != 0), "zero") + # While bias is added after multiplying qk with sm_scale, + # our optimization to use 2^x instead of e^x results in an additional + # scale factor of log2(e) which we must also multiply the bias with. 
+ qk += (bias * 1.44269504089) + m_ij = tl.maximum(m_i, tl.max(qk,1)) + qk = qk - m_ij[:, None] + p = tl.math.exp2(qk) + + # CAVEAT: Must update l_ij before applying dropout + l_ij = tl.sum(p, 1) + if ENABLE_DROPOUT: + philox_offset = batch_philox_offset + start_m * BLOCK_M * actual_seqlen_k + start_n - BLOCK_N + keep = dropout_mask(philox_seed, philox_offset, dropout_p, BLOCK_M, BLOCK_N, actual_seqlen_k) + if RETURN_ENCODED_SOFTMAX: + tl.store(encoded_softmax_block_ptr, tl.where(keep, p, -p).to(encoded_softmax_block_ptr.type.element_ty)) + p = tl.where(keep, p, 0.0) + elif RETURN_ENCODED_SOFTMAX: + tl.store(encoded_softmax_block_ptr, p.to(encoded_softmax_block_ptr.type.element_ty)) + # -- update output accumulator -- + alpha = tl.math.exp2(m_i - m_ij) + acc = acc * alpha[:, None] + if not PRE_LOAD_V: + v = load_fn(V_block_ptr, MASK_STEPS and (n_extra_tokens != 0), PADDED_HEAD, "zero") + # -- update m_i and l_i + l_i = l_i * alpha + l_ij + # update m_i and l_i + m_i = m_ij + acc += tl.dot(p.to(V_block_ptr.type.element_ty), v) + V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0)) + K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, (0, BLOCK_N)) + return acc, l_i, m_i + +@triton.autotune( + configs=[ + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 64, 'waves_per_eu': 2, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'waves_per_eu': 2, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config({'BLOCK_M': 256, 'BLOCK_N': 128, 'waves_per_eu': 2, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': True}, num_stages=1, num_warps=4), + triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + 
triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'waves_per_eu': 4, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + triton.Config({'BLOCK_M': 32, 'BLOCK_N': 32, 'waves_per_eu': 4, 'PRE_LOAD_V': False}, num_stages=1, num_warps=8), + # TODO: This config fails with head_size not pow2 with data mismatches. Check why. + # triton.Config({'BLOCK_M': 32, 'BLOCK_N': 16, 'waves_per_eu': 1, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + triton.Config({'BLOCK_M': 16, 'BLOCK_N': 16, 'waves_per_eu': 1, 'PRE_LOAD_V': False}, num_stages=1, num_warps=4), + ], + key=['hq', 'hk', 'IS_CAUSAL', 'dropout_p', 'BLOCK_DMODEL'], +) +@triton.jit +def attn_fwd( + Q, K, V, bias, sm_scale, L, Out, + stride_qz, stride_qh, stride_qm, stride_qk, + stride_kz, stride_kh, stride_kn, stride_kk, + stride_vz, stride_vh, stride_vk, stride_vn, + stride_oz, stride_oh, stride_om, stride_on, + stride_bz, stride_bh, stride_bm, stride_bn, + cu_seqlens_q, cu_seqlens_k, + dropout_p, philox_seed, philox_offset_base, encoded_softmax, + hq, hk, + ACTUAL_BLOCK_DMODEL:tl.constexpr, + MAX_SEQLENS_Q:tl.constexpr, MAX_SEQLENS_K:tl.constexpr, + VARLEN: tl.constexpr, + IS_CAUSAL: tl.constexpr, + BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, + PRE_LOAD_V: tl.constexpr, + BIAS_TYPE: tl.constexpr, + ENABLE_DROPOUT: tl.constexpr, RETURN_ENCODED_SOFTMAX: tl.constexpr +): + start_m = tl.program_id(0) + off_h_q = tl.program_id(1) + off_z = tl.program_id(2) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) + if VARLEN: + cu_seqlens_q_start = tl.load(cu_seqlens_q + off_z) + cu_seqlens_q_end = tl.load(cu_seqlens_q + off_z + 1) + seqlen_q = cu_seqlens_q_end - cu_seqlens_q_start + # We have a one-size-fits-all grid in id(0). Some seqlens might be too + # small for all start_m so for those we return early. 
+ if start_m * BLOCK_M > seqlen_q: + return + cu_seqlens_k_start = tl.load(cu_seqlens_k + off_z) + cu_seqlens_k_end = tl.load(cu_seqlens_k + off_z + 1) + seqlen_k = cu_seqlens_k_end - cu_seqlens_k_start + else: + cu_seqlens_q_start = 0 + cu_seqlens_k_start = 0 + seqlen_q = MAX_SEQLENS_Q + seqlen_k = MAX_SEQLENS_K + + # Now we compute whether we need to exit early due to causal masking. + # This is because for seqlen_q > seqlen_k, M rows of the attn scores + # are completely masked, resulting in 0s written to the output, and + # inf written to LSE. We don't need to do any GEMMs in this case. + # This block of code determines what N is, and if this WG is operating + # on those M rows. + n_blocks = cdiv_fn(seqlen_k, BLOCK_N) + if (IS_CAUSAL): + # If seqlen_q == seqlen_k, the attn scores are a square matrix. + # If seqlen_q != seqlen_k, attn scores are rectangular which means + # the causal mask boundary is bottom right aligned, and ends at either + # the top edge (seqlen_q < seqlen_k) or left edge. + # This captures the decrease in n_blocks if we have a rectangular attn matrix + n_blocks_seqlen = cdiv_fn( + (start_m + 1) * BLOCK_M + seqlen_k - seqlen_q, + BLOCK_N + ) + # This is what adjusts the block_max for the current WG, only + # if IS_CAUSAL. Otherwise we want to always iterate through all n_blocks + n_blocks = min(n_blocks, n_blocks_seqlen) + # If we have no blocks after adjusting for seqlen deltas, this WG is part of + # the blocks that are all 0. We exit early. 
+ if n_blocks <= 0: + o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) + # We still need to write 0s to the result + tl.store(O_block_ptr, acc.to(Out.type.element_ty), boundary_check=(0,1)) + l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # We store inf to LSE, not -inf because in the bwd pass, we subtract this + # from qk which makes it -inf, such that exp(qk - inf) = 0 for these masked blocks. + l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + tl.store(l_ptrs, l) + # TODO: Should dropout and return encoded softmax be handled here too? + return + + is_mqa = hq != hk + off_h_k = off_h_q % hk if is_mqa else off_h_q + need_padding = False + n_extra_tokens = 0 + if seqlen_k < BLOCK_N: + need_padding = True + n_extra_tokens = BLOCK_N - seqlen_k + elif seqlen_k % BLOCK_N: + need_padding = True + n_extra_tokens = seqlen_k % BLOCK_N + padded_head = (ACTUAL_BLOCK_DMODEL != BLOCK_DMODEL) + + # Compute pointers for all the tensors used in this kernel. 
+ q_offset = off_z * stride_qz + off_h_q * stride_qh + cu_seqlens_q_start * stride_qm + Q_block_ptr = tl.make_block_ptr( + base=Q + q_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_qm, stride_qk), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + k_offset = off_z * stride_kz + off_h_k * stride_kh + cu_seqlens_k_start * stride_kn + K_block_ptr = tl.make_block_ptr( + base=K + k_offset, + shape=(ACTUAL_BLOCK_DMODEL, seqlen_k), + strides=(stride_kk, stride_kn), + offsets=(0, 0), + block_shape=(BLOCK_DMODEL, BLOCK_N), + order=(0, 1) + ) + v_offset = off_z * stride_vz + off_h_k * stride_vh + cu_seqlens_k_start * stride_vk + V_block_ptr = tl.make_block_ptr( + base=V + v_offset, + shape=(seqlen_k, ACTUAL_BLOCK_DMODEL), + strides=(stride_vk, stride_vn), + offsets=(0, 0), + block_shape=(BLOCK_N, BLOCK_DMODEL), + order=(1, 0) + ) + if BIAS_TYPE != 0: + bias_ptr = tl.make_block_ptr( + base=bias + off_h_q * stride_bh, + shape=(seqlen_q, seqlen_k), + strides=(stride_bm, stride_bn), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0), + ) + else: + bias_ptr = None + if ENABLE_DROPOUT: + batch_philox_offset = philox_offset_base + off_hz * seqlen_q * seqlen_k + else: + batch_philox_offset = 0 + # We can ask to return the dropout mask without actually doing any dropout. In + # this case, we return an invalid pointer so indicate the mask is not valid. + # TODO: Fix encoded softmax. It currently uses just h_q in the base offset. 
+ if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.make_block_ptr( + base=encoded_softmax + off_h_q * seqlen_q * seqlen_k, + shape=(seqlen_q, seqlen_k), + strides=(seqlen_k, 1), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_N), + order=(1, 0) + ) + else: + encoded_softmax_block_ptr = 0 + # initialize pointer to m and l + m_i = tl.full([BLOCK_M], float("-inf"), dtype=tl.float32) + l_i = tl.full([BLOCK_M], 1.0, dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + # scale sm_scale by log_2(e) and use 2^x in the loop as we do not + # have native e^x support in HW. + qk_scale = sm_scale * 1.44269504089 + # Q is loaded once at the beginning and shared by all N blocks. + q = load_fn(Q_block_ptr, True, padded_head, "zero") + q = (q * qk_scale).to(Q_block_ptr.type.element_ty) + + # Here we compute how many full and masked blocks we have. + padded_block_k = n_extra_tokens != 0 + is_modulo_mn = not padded_block_k and (seqlen_q % BLOCK_M == 0) + if IS_CAUSAL: + # There are always at least BLOCK_M // BLOCK_N masked blocks. + # Additionally there might be one more due to dissimilar seqlens. + masked_blocks = BLOCK_M // BLOCK_N + (not is_modulo_mn) + else: + # Padding on Q does not need to be masked in the FA loop. + masked_blocks = padded_block_k + # if IS_CAUSAL, not is_modulo_mn does not always result in an additional block. + # In this case we might exceed n_blocks so pick the min. + masked_blocks = min(masked_blocks, n_blocks) + n_full_blocks = n_blocks - masked_blocks + block_min = 0 + block_max = n_blocks * BLOCK_N + # Compute for full blocks. Here we set causal to false regardless of its actual + # value because there is no masking. Similarly we do not need padding. 
+ if n_full_blocks > 0: + block_max = (n_blocks - masked_blocks) * BLOCK_N + acc, l_i, m_i = _attn_fwd_inner( + acc, l_i, m_i, q, K_block_ptr, V_block_ptr, + start_m, seqlen_k, + dropout_p, philox_seed, batch_philox_offset, encoded_softmax_block_ptr, + # _, _, offs_n_causal, masked_blocks, n_extra_tokens, _ + block_min, block_max, 0, 0, 0, bias_ptr, + # IS_CAUSAL, .... + False, BLOCK_M, BLOCK_DMODEL, BLOCK_N, offs_m, offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, False, ENABLE_DROPOUT, RETURN_ENCODED_SOFTMAX, padded_head + ) + block_min = block_max + block_max = n_blocks * BLOCK_N + + tl.debug_barrier() + # Remaining blocks, if any, are full / not masked. + if (masked_blocks > 0): + if IS_CAUSAL: + offs_n_causal = offs_n + (seqlen_q - seqlen_k) + else: + offs_n_causal = 0 + K_block_ptr = tl.advance(K_block_ptr, (0, n_full_blocks*BLOCK_N)) + V_block_ptr = tl.advance(V_block_ptr, (n_full_blocks*BLOCK_N, 0)) + if bias_ptr is not None: + bias_ptr = tl.advance(bias_ptr, (0, n_full_blocks*BLOCK_N)) + if RETURN_ENCODED_SOFTMAX: + encoded_softmax_block_ptr = tl.advance(encoded_softmax_block_ptr, + (0, n_full_blocks)) + acc, l_i, m_i = _attn_fwd_inner( + acc, l_i, m_i, q, K_block_ptr, V_block_ptr, + start_m, seqlen_k, + dropout_p, philox_seed, batch_philox_offset, encoded_softmax_block_ptr, + block_min, block_max, offs_n_causal, masked_blocks, n_extra_tokens, bias_ptr, + IS_CAUSAL, BLOCK_M, BLOCK_DMODEL, BLOCK_N, offs_m, offs_n, + # _, MASK_STEPS, ... + PRE_LOAD_V, True, ENABLE_DROPOUT, RETURN_ENCODED_SOFTMAX, padded_head + ) + # epilogue + acc = acc / l_i[:, None] + if ENABLE_DROPOUT: + acc = acc / (1 - dropout_p) + # If seqlen_q > seqlen_k but the delta is not a multiple of BLOCK_M, + # then we have one block with a row of all NaNs which come from computing + # softmax over a row of all -infs (-inf - inf = NaN). We check for that here + # and store 0s where there are NaNs as these rows should've been zeroed out. 
+ end_m_idx = (start_m + 1) * BLOCK_M + start_m_idx = start_m * BLOCK_M + causal_start_idx = seqlen_q - seqlen_k + acc = acc.to(Out.type.element_ty) + if IS_CAUSAL: + if causal_start_idx > start_m_idx and causal_start_idx < end_m_idx: + out_mask_boundary = tl.full((BLOCK_DMODEL,), causal_start_idx, dtype=tl.int32) + mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) + out_ptrs_mask = mask_m_offsets[:, None] >= out_mask_boundary[None, :] + z = 0.0 + acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) + # write back LSE + l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + # If seqlen_q not multiple of BLOCK_M, we need to mask out the last few rows. + # This is only true for the last M block. For others, overflow_size will be -ve + overflow_size = end_m_idx - seqlen_q + if overflow_size > 0: + boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # This is a > check because mask being 0 blocks the store. + l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + else: + tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + + # write back O + o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh + O_block_ptr = tl.make_block_ptr( + base=Out + o_offset, + shape=(seqlen_q, ACTUAL_BLOCK_DMODEL), + strides=(stride_om, stride_on), + offsets=(start_m * BLOCK_M, 0), + block_shape=(BLOCK_M, BLOCK_DMODEL), + order=(1, 0) + ) + # Need boundary check on this to make sure the padding from the + # Q and KV tensors in both dims are not part of what we store back. + # TODO: Do the boundary check optionally. 
+ tl.store(O_block_ptr, acc, boundary_check=(0,1)) + +def check_args(q, k, v, o, max_seqlens): + assert q.dim() == k.dim() and q.dim() == v.dim() + assert q.dim() == 4 + batch, nheads_q, seqlen_q, head_size = q.shape + _, nheads_k, seqlen_k, _ = k.shape + assert max_seqlens > 0 + assert k.shape == v.shape + assert q.shape[-1] == k.shape[-1] and q.shape[-1] == v.shape[-1] + # TODO: Change assert if we support qkl f8 and v f16 + assert q.dtype == k.dtype and q.dtype == v.dtype + # TODO: Fix assert to check head size <=256 once supported + assert head_size <= 128 + assert o.shape == q.shape + assert (nheads_q % nheads_k) == 0 + +class _attention(torch.autograd.Function): + @staticmethod + def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): + if o is None: + o = torch.empty_like(q, dtype=v.dtype) + check_args(q, k, v, o, metadata.max_seq_len) + + batch, seqlen_q, nheads_q, head_size = q.shape + _, seqlen_k, nheads_k, _ = k.shape + q_strides = (q.stride(0), q.stride(2), q.stride(1), q.stride(3)) + k_strides = (k.stride(0), k.stride(2), k.stride(1), k.stride(3)) + v_strides = (v.stride(0), v.stride(2), v.stride(1), v.stride(3)) + o_strides = (o.stride(0), o.stride(2), o.stride(1), o.stride(3)) + + # Get closest power of 2 over or equal to 32. + unpadded_head_dims = {32, 64, 128} + if head_size not in unpadded_head_dims: + padded_d_model = None + for i in unpadded_head_dims: + if i > head_size: + padded_d_model = i + break + assert padded_d_model is not None + else: + padded_d_model = head_size + + + grid = lambda META: ( + triton.cdiv(metadata.max_seq_len, META['BLOCK_M']), + nheads_q, + batch + ) + + encoded_softmax = None + + M = torch.empty((batch, nheads_q, metadata.max_seq_len), device=q.device, dtype=torch.float32) + + # Seed the RNG so we get reproducible results for testing. 
+ philox_seed = 0x1BF52 + philox_offset = 0x1D4B42 + + if bias is not None: + bias_strides = (bias.stride(0), bias.stride(1), + bias.stride(2), bias.stride(3)) + else: + bias_strides = (0,0,0,0) + + attn_fwd[grid]( + q, k, v, bias, sm_scale, M, o, + *q_strides, *k_strides, *v_strides, *o_strides, *bias_strides, + None, None, + dropout_p=0.0, + philox_seed=philox_seed, + philox_offset_base=philox_offset, + encoded_softmax=encoded_softmax, + hq=nheads_q, hk=nheads_k, + ACTUAL_BLOCK_DMODEL=head_size, + MAX_SEQLENS_Q=metadata.max_seq_len, + MAX_SEQLENS_K=metadata.max_seq_len, + IS_CAUSAL=causal, + VARLEN=False, + BLOCK_DMODEL=padded_d_model, + BIAS_TYPE=0 if bias is None else 1, + ENABLE_DROPOUT=False, + RETURN_ENCODED_SOFTMAX=False + ) + + ctx.save_for_backward(q, k, v, o, M) + ctx.grid = grid + ctx.sm_scale = sm_scale + ctx.BLOCK_DMODEL = head_size + ctx.causal = causal + ctx.dropout_p = 0.0 + ctx.philox_seed = philox_seed + ctx.philox_offset = philox_offset + ctx.encoded_softmax = encoded_softmax + ctx.return_encoded_softmax = False + return o, encoded_softmax + +attention = _attention.apply From b5ebb418e991b17015cc68f2b929e04ff2dd6494 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 15 Mar 2024 19:09:56 +0000 Subject: [PATCH 141/159] Skipping certain cache tests when using fp8 cache with e5m2 type. 
These were only initially enabled in light of the support for e4m3 --- tests/kernels/test_cache.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 36af1614546e7..423f0a884fb93 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -118,6 +118,8 @@ def test_reshape_and_cache( device: int, kv_cache_dtype: str, ) -> None: + if not is_hip() and kv_cache_dtype == "fp8": + pytest.skip() # This test is not tuned for e5m2 cuda precision random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -214,7 +216,9 @@ def test_swap_blocks( kv_cache_dtype: str, ) -> None: if kv_cache_dtype == "fp8" and "cpu" in direction: - return + pytest.skip() + if not is_hip() and kv_cache_dtype == "fp8": + pytest.skip() # This test is not tuned for e5m2 cuda precision random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): From 45d591276869a29927941ecd3b5781b030b806fa Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Fri, 15 Mar 2024 19:20:02 +0000 Subject: [PATCH 142/159] Fix yapf ci error --- tests/kernels/test_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 423f0a884fb93..4a4537c8abe9a 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -119,7 +119,7 @@ def test_reshape_and_cache( kv_cache_dtype: str, ) -> None: if not is_hip() and kv_cache_dtype == "fp8": - pytest.skip() # This test is not tuned for e5m2 cuda precision + pytest.skip() # This test is not tuned for e5m2 cuda precision random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): @@ -218,7 +218,7 @@ def test_swap_blocks( if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() if not is_hip() and kv_cache_dtype == "fp8": - pytest.skip() # This test is not tuned for e5m2 cuda precision + pytest.skip() # This test is not tuned for 
e5m2 cuda precision random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): From be708d0d10284571f52074d338e95b859e275dfe Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Mon, 18 Mar 2024 20:23:32 +0000 Subject: [PATCH 143/159] Removed gradlib and its tuned gemm in favor of tunable ops --- .gitignore | 1 + gradlib/csrc/grad_funcs.cu | 413 ------------ gradlib/csrc/hipbsolgemm.cu | 610 ------------------ gradlib/csrc/rocsolgemm.cu | 563 ---------------- gradlib/gemm_runner.py | 62 -- gradlib/gemm_tuner.py | 92 --- gradlib/gradlib/GemmTuner.py | 208 ------ gradlib/mm_test.py | 234 ------- gradlib/setup.py | 136 ---- run.sh | 32 - run_70b.sh | 98 --- run_70b_fast.sh | 65 -- run_llama2.sh | 98 --- vllm/config.py | 2 +- vllm/engine/ray_utils.py | 24 +- vllm/model_executor/layers/linear.py | 9 +- vllm/model_executor/layers/sampler.py | 3 +- vllm/model_executor/layers/tuned_gemm.py | 111 ---- .../parallel_utils/communication_op.py | 26 +- vllm/worker/model_runner.py | 3 +- vllm/worker/worker.py | 31 +- 21 files changed, 24 insertions(+), 2797 deletions(-) delete mode 100644 gradlib/csrc/grad_funcs.cu delete mode 100644 gradlib/csrc/hipbsolgemm.cu delete mode 100644 gradlib/csrc/rocsolgemm.cu delete mode 100644 gradlib/gemm_runner.py delete mode 100644 gradlib/gemm_tuner.py delete mode 100644 gradlib/gradlib/GemmTuner.py delete mode 100644 gradlib/mm_test.py delete mode 100644 gradlib/setup.py delete mode 100755 run.sh delete mode 100755 run_70b.sh delete mode 100755 run_70b_fast.sh delete mode 100755 run_llama2.sh delete mode 100644 vllm/model_executor/layers/tuned_gemm.py diff --git a/.gitignore b/.gitignore index b5195629e5cf3..b1513ef0ddb0c 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,7 @@ _build/ # hip files generated by PyTorch *.hip *_hip* +hip_compat.h # Benchmark dataset *.json diff --git a/gradlib/csrc/grad_funcs.cu b/gradlib/csrc/grad_funcs.cu deleted file mode 100644 index f6498fb2a3ba7..0000000000000 --- 
a/gradlib/csrc/grad_funcs.cu +++ /dev/null @@ -1,413 +0,0 @@ -// #ifdef __gfx908__ -// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others -// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h -// #undef __HIP_NO_HALF_OPERATORS__ -// #undef __HIP_NO_HALF_CONVERSIONS__ -// #endif - -#include -#include -#include -#include -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -#include -//#include -#include - -#include -#include -#include -#include -#include -#include -#include "nvToolsExt.h" - -// #ifdef USE_ROCM -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #endif - -// #ifdef __HIP_PLATFORM_HCC__ -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #if USE_GEMM_FLAGS_FP16_ALT_IMPL -// #ifdef ROCM_BACKWARD_PASS_GUARD -// flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; -// #endif -// #endif -// #endif - -#ifndef CHECK_HIP_ERROR -#define CHECK_HIP_ERROR(error) \ - if(error != hipSuccess) \ - { \ - fprintf(stderr, \ - "Hip error: '%s'(%d) at %s:%d\n", \ - hipGetErrorString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -#ifndef CHECK_HIPBLAS_ERROR -#define CHECK_HIPBLAS_ERROR(error) \ - if(error != HIPBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, \ - "hipBLAS error: '%s'(%d) at %s:%d\n", \ - hipblasStatusToString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -namespace { - /*thread_local*/ cudaStream_t weight_stream; - // BUG: DLM has event and stream on different devices error - // In multi-GPU scenerio, do names defined in this namespace exist on all devices? - // C++ keyword: thread_local <- maybe this can help? - /*thread_local*/ cudaEvent_t event; - - // hipBLASLt - hipblasLtHandle_t hipblaslt_handle; - hipblasLtMatmulPreference_t preference; - uint64_t workspace_size = 32*1024*1024; - //uint64_t workspace_size = 0; - void* d_workspace; - int request_solutions = 1; - int returnedAlgoCount = 0; - - struct MatMulConfig { - hipblasOperation_t op_A; - hipblasOperation_t op_B; - int M; - int N; - int K; - hipblasDatatype_t dtype; - - friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { - return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); - } - }; - - // std::map, std::vector> heuristic_map; - std::map heuristic_map; - - hipEvent_t start, stop; - int bench_iters { 1 }; - int warmup_iters { 1 }; - - bool cout_print = true; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * hipBLASLt GEMM call -*/ -hipblasStatus_t hipblasLtMatmul_wrapper( - hipblasLtHandle_t handle, - hipblasOperation_t op_A, - hipblasOperation_t 
op_B, - int m, int n, int k, - const void *alpha, - const void *a, - int lda, - const void *b, - int ldb, - const void *beta, - void *c, - int ldc, - hipblasDatatype_t dtype, - hipStream_t &stream) -{ - // TODO: flag is not supported for hipblasLt yet - int flag { 0 }; - if (dtype == HIPBLAS_R_16F) { - // use fp16 alt impl for MI200 - // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices - flag = rocblas_gemm_flags_fp16_alt_impl; - } - - nvtxRangePushA("hipBLASLt variables creation"); - hipblasLtMatrixLayout_t matA, matB, matC; - hipblasLtMatmulDesc_t matmul; - if (op_A == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); - } - if (op_B == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); - } - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); - nvtxRangePop(); - - // if heuristic does not exist in the map, do search and push into the map - auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; - if (heuristic_map.count(gemm_key) <= 0) { - nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); - if (cout_print) { - std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? 
"N" : "T") - << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype - << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; - } - std::vector heuristicResult(request_solutions); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - handle, matmul, matA, matB, matC, matC, - preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); - if((returnedAlgoCount != request_solutions) && cout_print) { - std::cout << "less solution found! request: " << request_solutions - << ", found: " << returnedAlgoCount << std::endl; - } - - if (returnedAlgoCount == 1) { - heuristic_map[gemm_key] = heuristicResult[0]; - } else { - // benchmark requested solutions and pick best one - int bestIndex { -1 }; - double bestMs { std::numeric_limits::max() }; - for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { - // warm up - for (int iter { 0 }; iter < warmup_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - // performance measuring - double eventMs; - CHECK_HIP_ERROR(hipEventRecord(start, stream)); - for (int iter { 0 }; iter < bench_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - CHECK_HIP_ERROR(hipEventRecord(stop, stream)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - float temp; - CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); - eventMs = double(temp); - eventMs /= bench_iters; - - if (cout_print) { - 
std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; - } - if (bestMs > eventMs) { - bestMs = eventMs; - bestIndex = sol; - if (cout_print) { - std::cout << " *" << std::endl; - } - } else { - if (cout_print) { - std::cout << std::endl; - } - } - } - heuristic_map[gemm_key] = heuristicResult[bestIndex]; - } - nvtxRangePop(); - } - - hipblasStatus_t status = hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, - &heuristic_map[gemm_key].algo, - d_workspace, workspace_size, - stream); - - nvtxRangePushA("hipBLASLt variables deletion"); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); - nvtxRangePop(); - - return status; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -torch::Tensor hipBLASLtMm_( - const torch::Tensor& mat1, - const torch::Tensor& mat2) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - // std::cout << " | result info: size: " << result.sizes() 
<< " stride: " << result.strides() << std::endl; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl - // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl - // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; - // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 
0 : 1); - // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl - // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; - - int flag { 0 }; - hipblasDatatype_t hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIPBLAS_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIPBLAS_R_16B; - } else if (abcType == at::kFloat) { - hipblasType = HIPBLAS_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream)); - - return result; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void create_extension() -{ - CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); - CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); - CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( - preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); - - CHECK_HIP_ERROR(hipEventCreate(&start)); - CHECK_HIP_ERROR(hipEventCreate(&stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void destroy_extension() -{ - CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); - 
CHECK_HIP_ERROR(hipEventDestroy(event)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); - CHECK_HIP_ERROR(hipFree(d_workspace)); - - CHECK_HIP_ERROR(hipEventDestroy(start)); - CHECK_HIP_ERROR(hipEventDestroy(stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("create_extension", &create_extension, "create_extension"); - m.def("destroy_extension", &destroy_extension, "destroy_extension"); - m.def("mm", &hipBLASLtMm_, "mm"); -} diff --git a/gradlib/csrc/hipbsolgemm.cu b/gradlib/csrc/hipbsolgemm.cu deleted file mode 100644 index bf15fb1297667..0000000000000 --- a/gradlib/csrc/hipbsolgemm.cu +++ /dev/null @@ -1,610 +0,0 @@ -// #ifdef __gfx908__ -// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others -// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h -// #undef __HIP_NO_HALF_OPERATORS__ -// #undef __HIP_NO_HALF_CONVERSIONS__ -// #endif - -#include -#include -#include -#include -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include "nvToolsExt.h" - -//#include - - -// #ifdef USE_ROCM -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #endif - -// #ifdef __HIP_PLATFORM_HCC__ -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #if USE_GEMM_FLAGS_FP16_ALT_IMPL -// #ifdef ROCM_BACKWARD_PASS_GUARD -// flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; -// #endif -// #endif -// #endif - -#ifndef CHECK_HIP_ERROR -#define CHECK_HIP_ERROR(error) \ - if(error != hipSuccess) \ - { \ - fprintf(stderr, \ - "Hip error: '%s'(%d) at %s:%d\n", \ - hipGetErrorString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -#ifndef CHECK_HIPBLAS_ERROR -#define CHECK_HIPBLAS_ERROR(error) \ - if(error != HIPBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, \ - "hipBLAS error: '%s'(%d) at %s:%d\n", \ - hipblasStatusToString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -namespace { - /*thread_local*/ cudaStream_t weight_stream; - // BUG: DLM has event and stream on different devices error - // In multi-GPU scenerio, do names defined in this namespace exist on all devices? - // C++ keyword: thread_local <- maybe this can help? - /*thread_local*/ cudaEvent_t event; - - // hipBLASLt - hipblasLtHandle_t hipblaslt_handle; - hipblasLtMatmulPreference_t preference; - size_t workspace_size = 2*128*1024*1024; - //uint64_t workspace_size = 0; - void* d_workspace; - int request_solutions = 1; - int returnedAlgoCount = 0; - - struct MatMulConfig { - hipblasOperation_t op_A; - hipblasOperation_t op_B; - int M; - int N; - int K; - hipDataType dtype; - - friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { - return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); - } - }; - - // std::map, std::vector> heuristic_map; - std::map heuristic_map; - - hipEvent_t start, stop; - int bench_iters { 1 }; - int warmup_iters { 1 }; - - bool cout_print = false; - - //std::vector heuristicResult; -} - -//find all hipblaslt solutions for given gemm problem -std::vector hipblasLtMatmul_findallsols_wrapper( - hipblasLtHandle_t handle, - hipblasOperation_t op_A, - hipblasOperation_t op_B, - int m, int n, int k, - const void *alpha, 
- const void *a, - int lda, - const void *b, - int ldb, - const void *beta, - void *c, - int ldc, - hipDataType dtype, - hipStream_t &stream) -{ - int flag { 0 }; - hipblasLtMatrixLayout_t matA, matB, matC; - hipblasLtMatmulDesc_t matmul; - if (op_A == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); - } - if (op_B == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); - } - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLAS_COMPUTE_32F, HIP_R_32F)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); - - //std::vector heuristicResult(10); - //CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - // handle, matmul, matA, matB, matC, matC, - // preference, 10, heuristicResult.data(), &returnedAlgoCount)); - std::vector heuristicResult; - CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAllAlgos(handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, - op_A, - op_B, - dtype, - dtype, - dtype, - dtype, - HIPBLAS_COMPUTE_32F, - heuristicResult)); - - std::vector algoIndex; - int returned_algo_count = heuristicResult.size(); - //for (int i = 0; i < returnedAlgoCount; i++) { - for (int i = 0; i < returned_algo_count; i++) { - auto algo = heuristicResult[i].algo; - size_t ret_workspace_size = 0; - auto status = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, - alpha, - matA, - matB, - beta, - matC, - matC, - algo, - ret_workspace_size - ); - if (status == HIPBLAS_STATUS_SUCCESS) { - if (ret_workspace_size heuristicResult(1); - if (solution_index<0) { - 
//nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); - std::cout << "Warning! HipbSolId Gemm Fallback Path used for solution index <0" << std::endl; - if (cout_print) { - std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? "N" : "T") - << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype - << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; - } - //std::vector heuristicResult(request_solutions); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - handle, matmul, matA, matB, matC, matC, - preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); - if((returnedAlgoCount != request_solutions) && cout_print) { - std::cout << "less solution found! request: " << request_solutions - << ", found: " << returnedAlgoCount << std::endl; - } - //heuristic_map[gemm_key] = heuristicResult[0]; -/* - if (returnedAlgoCount == 1) { - heuristic_map[gemm_key] = heuristicResult[0]; - } else { - // benchmark requested solutions and pick best one - int bestIndex { -1 }; - double bestMs { std::numeric_limits::max() }; - for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { - // warm up - for (int iter { 0 }; iter < warmup_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - // performance measuring - double eventMs; - CHECK_HIP_ERROR(hipEventRecord(start, stream)); - for (int iter { 0 }; iter < bench_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - 
&heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - CHECK_HIP_ERROR(hipEventRecord(stop, stream)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - float temp; - CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); - eventMs = double(temp); - eventMs /= bench_iters; - - if (cout_print) { - std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; - } - if (bestMs > eventMs) { - bestMs = eventMs; - bestIndex = sol; - if (cout_print) { - std::cout << " *" << std::endl; - } - } else { - if (cout_print) { - std::cout << std::endl; - } - } - } - heuristic_map[gemm_key] = heuristicResult[bestIndex]; - } -*/ - //nvtxRangePop(); - } else { - std::vector algoIndex(1); - algoIndex[0]=solution_index; - //std::vector tmpAlgo; - CHECK_HIPBLAS_ERROR(hipblaslt_ext::getAlgosFromIndex(handle, algoIndex, heuristicResult)); - } - - //size_t ret_workspace_size = 0; - - //auto status1 = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, - // alpha, - // matA, - // matB, - // beta, - // matC, - // matC, - // heuristicResult[0].algo, - // ret_workspace_size - //); - //if (status1 == HIPBLAS_STATUS_SUCCESS) { - // std::cout << "Workspace size" << ret_workspace_size << std::endl; - - //} else { - // std::cout << "Algo not supported!!!" 
<< std::endl; - - //} - hipblasStatus_t status = hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, - &heuristicResult[0].algo, - d_workspace, workspace_size, - stream); - - //nvtxRangePushA("hipBLASLt variables deletion"); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); - //nvtxRangePop(); - - return status; -} -///////////////////////////////////////////////////////////////////////////////////////////////////////// -torch::Tensor HipbSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2, - const int solution_index - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, 
mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl - // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl - // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; - // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 
0 : 1); - // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl - // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; - - hipDataType hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIP_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIP_R_16BF; - } else if (abcType == at::kFloat) { - hipblasType = HIP_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - CHECK_HIPBLAS_ERROR(hipblasLtMatmul_sol_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream,solution_index)); - - return result; -} - -//find all hipblas solutions and return them to python land -std::vector HipbFindAllSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2 - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= 
std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 0 : 1); - hipDataType hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIP_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIP_R_16BF; - } else if (abcType == at::kFloat) { - hipblasType = HIP_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - return hipblasLtMatmul_findallsols_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? 
HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream); - -} -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void hipb_create_extension() -{ - //CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); - //CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); - CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( - preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); - - //CHECK_HIP_ERROR(hipEventCreate(&start)); - //CHECK_HIP_ERROR(hipEventCreate(&stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void hipb_destroy_extension() -{ - //CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); - //CHECK_HIP_ERROR(hipEventDestroy(event)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); - CHECK_HIP_ERROR(hipFree(d_workspace)); - - //CHECK_HIP_ERROR(hipEventDestroy(start)); - //CHECK_HIP_ERROR(hipEventDestroy(stop)); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("hipb_create_extension", &hipb_create_extension, "create_extension"); - m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension"); - m.def("hipb_mm", &HipbSolIdxBlas, "mm"); - m.def("hipb_findallsols", &HipbFindAllSolIdxBlas, "hipblas_find_all_sols"); -} diff --git a/gradlib/csrc/rocsolgemm.cu b/gradlib/csrc/rocsolgemm.cu deleted file mode 100644 index d691fcac416a6..0000000000000 --- 
a/gradlib/csrc/rocsolgemm.cu +++ /dev/null @@ -1,563 +0,0 @@ -// #ifdef __gfx908__ -// // Uncomment ifdef and endif only if you need to undef the HIP_HALF ops below just for gfx908 and not for others -// // below lines enable hip float to half conversion which are disabled by default in hip_fp16.h -// #undef __HIP_NO_HALF_OPERATORS__ -// #undef __HIP_NO_HALF_CONVERSIONS__ -// #endif - -#define ROCBLAS_NO_DEPRECATED_WARNINGS -#define ROCBLAS_BETA_FEATURES_API - -#include -#include -#include -#include -#include -#include -#include -#include -// #include -#include -#include -#include -#include - -#include -//#include -#include - -#include -#include -#include -#include -#include -#include -#include "nvToolsExt.h" - -#include - - -// #ifdef USE_ROCM -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #endif - -// #ifdef __HIP_PLATFORM_HCC__ -// #define PYTORCH_ROCBLAS_VERSION_DECIMAL (ROCBLAS_VERSION_MAJOR * 100 + ROCBLAS_VERSION_MINOR) -// #define USE_GEMM_FLAGS_FP16_ALT_IMPL (PYTORCH_ROCBLAS_VERSION_DECIMAL >= 242) -// #if USE_GEMM_FLAGS_FP16_ALT_IMPL -// #ifdef ROCM_BACKWARD_PASS_GUARD -// flag = at::BackwardPassGuard::is_backward_pass() ? 
rocblas_gemm_flags_fp16_alt_impl : 0; -// #endif -// #endif -// #endif - -#ifndef CHECK_HIP_ERROR -#define CHECK_HIP_ERROR(error) \ - if(error != hipSuccess) \ - { \ - fprintf(stderr, \ - "Hip error: '%s'(%d) at %s:%d\n", \ - hipGetErrorString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -#ifndef CHECK_HIPBLAS_ERROR -#define CHECK_HIPBLAS_ERROR(error) \ - if(error != HIPBLAS_STATUS_SUCCESS) \ - { \ - fprintf(stderr, \ - "hipBLAS error: '%s'(%d) at %s:%d\n", \ - hipblasStatusToString(error), \ - error, \ - __FILE__, \ - __LINE__); \ - exit(EXIT_FAILURE); \ - } -#endif - -namespace { - rocblas_handle r_handle; - - /*thread_local*/ cudaStream_t weight_stream; - // BUG: DLM has event and stream on different devices error - // In multi-GPU scenerio, do names defined in this namespace exist on all devices? - // C++ keyword: thread_local <- maybe this can help? - /*thread_local*/ cudaEvent_t event; - - // hipBLASLt - hipblasLtHandle_t hipblaslt_handle; - hipblasLtMatmulPreference_t preference; - uint64_t workspace_size = 32*1024*1024; - //uint64_t workspace_size = 0; - void* d_workspace; - int request_solutions = 1; - int returnedAlgoCount = 0; - - struct MatMulConfig { - hipblasOperation_t op_A; - hipblasOperation_t op_B; - int M; - int N; - int K; - hipblasDatatype_t dtype; - - friend auto operator<(const MatMulConfig& left, const MatMulConfig& right) -> bool { - return std::tie(left.op_A, left.op_B, left.M, left.N, left.K, left.dtype) < std::tie(right.op_A, right.op_B, right.M, right.N, right.K, right.dtype); - } - }; - - // std::map, std::vector> heuristic_map; - std::map heuristic_map; - - hipEvent_t start, stop; - int bench_iters { 1 }; - int warmup_iters { 1 }; - - bool cout_print = true; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// -/** - * hipBLASLt GEMM call -*/ -/* -hipblasStatus_t hipblasLtMatmul_wrapper( - hipblasLtHandle_t handle, - 
hipblasOperation_t op_A, - hipblasOperation_t op_B, - int m, int n, int k, - const void *alpha, - const void *a, - int lda, - const void *b, - int ldb, - const void *beta, - void *c, - int ldc, - hipblasDatatype_t dtype, - hipStream_t &stream) -{ - // TODO: flag is not supported for hipblasLt yet - int flag { 0 }; - if (dtype == HIPBLAS_R_16F) { - // use fp16 alt impl for MI200 - // https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices - flag = rocblas_gemm_flags_fp16_alt_impl; - } - - nvtxRangePushA("hipBLASLt variables creation"); - hipblasLtMatrixLayout_t matA, matB, matC; - hipblasLtMatmulDesc_t matmul; - if (op_A == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, m, k, lda)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matA, dtype, k, m, lda)); - } - if (op_B == HIPBLAS_OP_N) { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, k, n, ldb)); - } else { - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matB, dtype, n, k, ldb)); - } - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutCreate(&matC, dtype, m, n, ldc)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescCreate(&matmul, HIPBLASLT_COMPUTE_F32, HIPBLAS_R_32F)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &op_A, sizeof(int32_t))); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescSetAttribute( - matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &op_B, sizeof(int32_t))); - nvtxRangePop(); - - // if heuristic does not exist in the map, do search and push into the map - auto gemm_key { MatMulConfig { op_A, op_B, m, n, k, dtype } }; - if (heuristic_map.count(gemm_key) <= 0) { - nvtxRangePushA("hipblasLtMatmulAlgoGetHeuristic"); - if (cout_print) { - std::cout << (op_A == HIPBLAS_OP_N ? "N" : "T") << (op_B == HIPBLAS_OP_N ? 
"N" : "T") - << " (" << m << ", " << n << ", " << k << "), dtype: " << dtype - << ", (lda, ldb, ldc): (" << lda << ", " << ldb << ", " << ldc << "), " << std::endl; - } - std::vector heuristicResult(request_solutions); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulAlgoGetHeuristic( - handle, matmul, matA, matB, matC, matC, - preference, request_solutions, heuristicResult.data(), &returnedAlgoCount)); - if((returnedAlgoCount != request_solutions) && cout_print) { - std::cout << "less solution found! request: " << request_solutions - << ", found: " << returnedAlgoCount << std::endl; - } - - if (returnedAlgoCount == 1) { - heuristic_map[gemm_key] = heuristicResult[0]; - } else { - // benchmark requested solutions and pick best one - int bestIndex { -1 }; - double bestMs { std::numeric_limits::max() }; - for (int sol { 0 }; sol < returnedAlgoCount; ++sol) { - // warm up - for (int iter { 0 }; iter < warmup_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - // performance measuring - double eventMs; - CHECK_HIP_ERROR(hipEventRecord(start, stream)); - for (int iter { 0 }; iter < bench_iters; ++iter) { - CHECK_HIPBLAS_ERROR(hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, // In case beta != 0, these runs can overwrite the values in c - // since c and d are the same - // TODO: allocates separate d memory for these runs - &heuristicResult[sol].algo, - d_workspace, workspace_size, - stream)); - } - CHECK_HIP_ERROR(hipEventRecord(stop, stream)); - CHECK_HIP_ERROR(hipEventSynchronize(stop)); - float temp; - CHECK_HIP_ERROR(hipEventElapsedTime(&temp, start, stop)); - eventMs = double(temp); - eventMs /= bench_iters; - - if (cout_print) { - 
std::cout << " Sol " << sol << ": average time per iter " << std::to_string(eventMs) << " ms"; - } - if (bestMs > eventMs) { - bestMs = eventMs; - bestIndex = sol; - if (cout_print) { - std::cout << " *" << std::endl; - } - } else { - if (cout_print) { - std::cout << std::endl; - } - } - } - heuristic_map[gemm_key] = heuristicResult[bestIndex]; - } - nvtxRangePop(); - } - - hipblasStatus_t status = hipblasLtMatmul(handle, matmul, - alpha, - a, matA, - b, matB, - beta, - c, matC, - c, matC, - &heuristic_map[gemm_key].algo, - d_workspace, workspace_size, - stream); - - nvtxRangePushA("hipBLASLt variables deletion"); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulDescDestroy(matmul)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matA)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matB)); - CHECK_HIPBLAS_ERROR(hipblasLtMatrixLayoutDestroy(matC)); - nvtxRangePop(); - - return status; -} -*/ -///////////////////////////////////////////////////////////////////////////////////////////////////////// -std::vector RocFindAllSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2 - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && 
(mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 0 : 1); - - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - - rocblas_set_stream(r_handle, current_stream); - uint32_t flags { 0 }; - rocblas_datatype abcRtype; - if (abcType == at::kHalf) { - abcRtype = rocblas_datatype_f16_r; - } else if (abcType == at::kBFloat16) { - abcRtype = rocblas_datatype_bf16_r; - } else if (abcType == at::kFloat) { - abcRtype = rocblas_datatype_f32_r; - } else { - assert(false && "Wrong datatype!"); - } - - #define GEMM_EX_ARGS \ - r_handle, transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, \ - m, n, k, &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, &zero, ptrC, \ - abcRtype, result_ld, ptrC, abcRtype, result_ld, rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index - - rocblas_int sizeSolve; - //CHECK_ROCBLAS_ERROR( - rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, NULL, &sizeSolve); - - // Fill array with list of solutions that match type - // Note: some of these may be invalid - std::vector solutionsSolve(sizeSolve); - //CHECK_ROCBLAS_ERROR( - rocblas_gemm_ex_get_solutions(GEMM_EX_ARGS, rocblas_gemm_flags_none, solutionsSolve.data(), &sizeSolve); - - std::vector validSolutions; - for(auto sol : solutionsSolve) { - auto status = rocblas_gemm_ex(r_handle, - transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, - transpose_mat2 ? rocblas_operation_transpose : rocblas_operation_none, - m, n, k, - &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, - &zero, ptrC, abcRtype, result_ld, - ptrC, abcRtype, result_ld, - rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, sol, rocblas_gemm_flags_none); - if (status == rocblas_status_success) { - validSolutions.push_back(sol); - } - } - - return validSolutions; -} -///////////////////////////////////////////////////////////////////////////////////////////////////////// -torch::Tensor RocSolIdxBlas( - const torch::Tensor& mat1, - const torch::Tensor& mat2, - const int32_t solution_index=0 - ) -{ - auto mat1_strides { mat1.strides() }; - auto mat2_strides { mat2.strides() }; - auto mat1_sizes { mat1.sizes() }; - auto mat2_sizes { mat2.sizes() }; - // std::cout << " | mat1 info: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | mat2 info: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - TORCH_CHECK(mat1.dim() == 2 && mat2.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK( - mat1.dtype() == mat2.dtype(), - "expected mat1 and mat2 to have the same dtype, 
but got: ", mat1.dtype(), " != ", mat2.dtype() - ); - TORCH_CHECK(mat1_sizes[1] == mat2_sizes[0], "mat1 dim 1 must match mat2 dim 0"); - - auto abcType { mat1.options().dtype() }; - auto options { at::TensorOptions().dtype(abcType).device(at::kCUDA) }; - auto result { torch::empty({ mat1_sizes[0], mat2_sizes[1] }, options) }; - // std::cout << " | result info: size: " << result.sizes() << " stride: " << result.strides() << std::endl; - - bool transpose_result = true; - bool transpose_mat1; - bool transpose_mat2; - if ((mat2_strides[0] == 1) && (mat2_strides[1] >= std::max(1, mat2_sizes[0]))) { - transpose_mat2 = false; - } else if ((mat2_strides[1] == 1) && (mat2_strides[0] >= std::max(1, mat2_sizes[1]))) { - transpose_mat2 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - if ((mat1_strides[0] == 1) && (mat1_strides[1] >= std::max(1, mat1_sizes[0]))) { - transpose_mat1 = false; - } else if ((mat1_strides[1] == 1) && (mat1_strides[0] >= std::max(1, mat1_sizes[1]))) { - transpose_mat1 = true; - } else { - assert(false && "unusual strides detected, may need to clone a contiguous tensor"); - } - - if (transpose_result) { - bool tmp = transpose_mat1; - transpose_mat1 = !transpose_mat2; - transpose_mat2 = !tmp; - mat1_strides = mat2.strides(); - mat2_strides = mat1.strides(); - mat1_sizes = mat2.sizes(); - mat2_sizes = mat1.sizes(); - } - // std::cout << " | transpose_result: " << (transpose_result ? "true" : "false") << std::endl - // << " | transpose_A: " << (transpose_mat1 ? "true" : "false") << std::endl - // << " | transpose_B: " << (transpose_mat2 ? "true" : "false") << std::endl; - // std::cout << " | A matrix: size: " << mat1_sizes << " stride: " << mat1_strides << std::endl - // << " | B matrix: size: " << mat2_sizes << " stride: " << mat2_strides << std::endl; - - float one { 1.0f }; - float zero { 0.0f }; - int64_t m = mat1_sizes[transpose_result ? 1 : 0]; - int64_t k = mat1_sizes[transpose_result ? 
0 : 1]; - int64_t n = mat2_sizes[transpose_result ? 0 : 1]; - int64_t mat1_ld = mat1_strides[(transpose_mat1 == transpose_result) ? 1 : 0]; - int64_t mat2_ld = mat2_strides[(transpose_mat2 == transpose_result) ? 1 : 0]; - int64_t result_ld = result.stride(transpose_result ? 0 : 1); - // std::cout << " | (m, n, k): " << m << ", " << n << ", " << k << std::endl - // << " | (lda, ldb, ldc): " << mat1_ld << ", " << mat2_ld << ", " << result_ld << std::endl; - - /* - int flag { 0 }; - hipblasDatatype_t hipblasType; - if (abcType == at::kHalf) { - hipblasType = HIPBLAS_R_16F; - } else if (abcType == at::kBFloat16) { - hipblasType = HIPBLAS_R_16B; - } else if (abcType == at::kFloat) { - hipblasType = HIPBLAS_R_32F; - } else { - assert(false && "Wrong datatype!"); - } - */ - void *ptrA { static_cast((transpose_result ? mat2 : mat1).data_ptr()) }; - void *ptrB { static_cast((transpose_result ? mat1 : mat2).data_ptr()) }; - void *ptrC { static_cast(result.data_ptr()) }; - auto current_stream { torch::hip::getCurrentHIPStream().stream() }; - /* - - CHECK_HIPBLAS_ERROR(hipblasLtMatmul_wrapper( - hipblaslt_handle, - transpose_mat1 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - transpose_mat2 ? HIPBLAS_OP_T : HIPBLAS_OP_N, - m, n, k, - &one, - ptrA, mat1_ld, - ptrB, mat2_ld, - &zero, - ptrC, result_ld, - hipblasType, - current_stream)); - */ - rocblas_set_stream(r_handle, current_stream); - uint32_t flags { 0 }; - //int32_t solution_index {0}; - rocblas_datatype abcRtype; - if (abcType == at::kHalf) { - abcRtype = rocblas_datatype_f16_r; - } else if (abcType == at::kBFloat16) { - abcRtype = rocblas_datatype_bf16_r; - } else if (abcType == at::kFloat) { - abcRtype = rocblas_datatype_f32_r; - } else { - assert(false && "Wrong datatype!"); - } - - //CHECK_ROCBLAS_ERROR( - rocblas_gemm_ex(r_handle, - transpose_mat1 ? rocblas_operation_transpose : rocblas_operation_none, - transpose_mat2 ? 
rocblas_operation_transpose : rocblas_operation_none, - m, n, k, - &one, ptrA, abcRtype, mat1_ld, ptrB, abcRtype, mat2_ld, - &zero, ptrC, abcRtype, result_ld, - ptrC, abcRtype, result_ld, - rocblas_datatype_f32_r, rocblas_gemm_algo_solution_index, solution_index, flags); - //); - - - return result; -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void rocb_create_extension() -{ - /* - CHECK_HIP_ERROR(hipStreamCreate(&weight_stream)); - CHECK_HIP_ERROR(hipEventCreateWithFlags(&event, cudaEventDisableTiming)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtCreate(&hipblaslt_handle)); - CHECK_HIP_ERROR(hipMalloc(&d_workspace, workspace_size)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceCreate(&preference)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceSetAttribute( - preference, HIPBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &workspace_size, sizeof(workspace_size))); - - CHECK_HIP_ERROR(hipEventCreate(&start)); - CHECK_HIP_ERROR(hipEventCreate(&stop)); */ - rocblas_create_handle(&r_handle); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -void rocb_destroy_extension() -{ - /* - CHECK_HIP_ERROR(hipStreamDestroy(weight_stream)); - CHECK_HIP_ERROR(hipEventDestroy(event)); - - // hipBLASLt - CHECK_HIPBLAS_ERROR(hipblasLtDestroy(hipblaslt_handle)); - CHECK_HIPBLAS_ERROR(hipblasLtMatmulPreferenceDestroy(preference)); - CHECK_HIP_ERROR(hipFree(d_workspace)); - - CHECK_HIP_ERROR(hipEventDestroy(start)); - CHECK_HIP_ERROR(hipEventDestroy(stop)); */ - rocblas_destroy_handle(r_handle); -} - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("rocb_create_extension", &rocb_create_extension, "create_extension"); - m.def("rocb_destroy_extension", &rocb_destroy_extension, "destroy_extension"); - m.def("rocb_mm", &RocSolIdxBlas, "mm"); - 
m.def("rocb_findallsols", &RocFindAllSolIdxBlas, "rocblas_find_all_sols"); -} diff --git a/gradlib/gemm_runner.py b/gradlib/gemm_runner.py deleted file mode 100644 index 34a246771a820..0000000000000 --- a/gradlib/gemm_runner.py +++ /dev/null @@ -1,62 +0,0 @@ -import torch -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -import timeit - -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -class TunedGemm: - def __init__(self,tuned_csv_file): - self.bestsols = pd.read_csv(tuned_csv_file,index_col=[0]) - self.create_ds() - def create_ds(self): - df = self.bestsols - solds = {} - for i in range(len(df)): - ds = df.iloc[i] - key = (ds['M'],ds['N'],ds['K']) - if ds['libtype']=='hipblaslt': soltype = 1 - elif ds['libtype']=='rocblas': soltype = 2 - solds[key] = (soltype,int(ds['solidx'])) - #print(solds) - self.solids = solds - def query_sol(self,m,n,k): - return self.solids.get((m,n,k),(0,0)) - def mm(self,inp,weights): - soltype,solidx = self.query_sol(m=weights.shape[0],n=inp.shape[0],k=inp.shape[1]) - if soltype==1: - out = hipbsolidxgemm.hipb_mm(inp,weights.t(),solidx) - elif soltype==2: - out = rocsolidxgemm.rocb_mm(inp,weights.t(),solidx) - else: - out = F.linear(inp,weights) - return out - def run_all_tuned_sols(self): - for i in range(len(self.bestsols)): - ds = self.bestsols.iloc[i] - print('>>> Running tuned solution') - print(ds) - inp = torch.randn((ds['N'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') - weights = torch.randn((ds['M'], ds['K']), dtype=get_dtype(ds['dtype']), device='cuda') - self.mm(inp,weights) - -def get_dtype(dtype_csv): - if dtype_csv=='torch.float16': - dtype = torch.float16 - elif dtype_csv=='torch.bfloat16': - dtype = torch.bfloat16 - elif dtype_csv=='torch.float32': - dtype = torch.float32 - return dtype - -if __name__ == '__main__': - tgemm = TunedGemm(sys.argv[1]) #csv file with tuned sols goes in argv[1] - 
print(tgemm.bestsols) - tgemm.run_all_tuned_sols() - - diff --git a/gradlib/gemm_tuner.py b/gradlib/gemm_tuner.py deleted file mode 100644 index b6c69379cf6c6..0000000000000 --- a/gradlib/gemm_tuner.py +++ /dev/null @@ -1,92 +0,0 @@ -import torch -import os -import argparse -from gradlib.GemmTuner import GemmTuner -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -import json -import random -from pathlib import Path -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -''' -{'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 5120, 'initializer_range': 0.02, -'intermediate_size': 13824, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 40, 'num_hidden_layers': 40, 'num_key_value_heads': 40, -'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float16', 'transformers_version': '4.33.0.dev0', 'use_cache': True, 'vocab_size': 32000} -''' -def generate_mk_sets(model_dir, tp=1): - f = open(f'{model_dir}/config.json') - data = json.load(f) - hidden_size = data['hidden_size'] - intermediate_size = data['intermediate_size'] - total_num_heads = data['num_attention_heads'] - total_num_kv_heads = data['num_key_value_heads'] - head_dim = hidden_size // total_num_heads - return [((total_num_heads + (2*total_num_kv_heads)) * head_dim // tp, hidden_size), (hidden_size, hidden_size // tp), (intermediate_size *2 // tp, hidden_size), (hidden_size, intermediate_size // tp) ], hidden_size - -def get_dtype(dtype_str): - dtype = torch.float16 - if dtype_str == 'f32': - dtype = torch.float32 - elif dtype_str == 'bf16': - dtype = torch.bfloat16 - elif dtype_str == 'f16': - dtype = torch.float16 - else: - print('>>> Warning! 
Invalid dtype', dtype_str, 'using default dtype f16') - return dtype - - -def list_of_ints(arg): - return list(map(int, arg.split(','))) - -def load_input_gemms(input_file): - if Path(input_file).is_file(): - return - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument("--model_dir", type=str, default=os.getenv('GTUNE_MODEL', ""), help="Enter the location of your model directory") - parser.add_argument("--tuned_file", type=str, default=os.getenv('GTUNE_TUNED', "tuned.csv"), help="output file for tuned gemm solutions") - parser.add_argument("--input_file", type=str, default=os.getenv('GTUNE_INPUT', None), help="list of gemms to tune for, mutually exclusive with model_dir") - parser.add_argument("--tp", type=int, default=os.getenv('GTUNE_TP', 1), help="Tensor parallelism to be used.") - parser.add_argument("--dtype", type=str, default='f16', help="dtype f32 f16 bf16") - parser.add_argument("--rocblas-decode", action="store_true", default=False, help="forces rocblas solution on decode N=1") - parser.add_argument("--batch_size", type=int, default=os.getenv('GTUNE_BATCH_SIZE', 1), help="Batch size to tune for") - parser.add_argument("--nsets", type=list_of_ints, default=[1, 512, 1024, 2048, 3072, 4096, 8192, 16384], help="N sizes to tune for: 1,128,2048") - args = parser.parse_args() - - dtype = get_dtype(args.dtype) - - gtuner = GemmTuner(dtype, args.tuned_file, args.rocblas_decode) - nsets = [i * args.batch_size for i in args.nsets] - if args.input_file: - print(f">>> Loading {args.input_file}") - if not Path(args.input_file).is_file(): - print(f">>> ERROR: {args.input_file} does not exist. Exiting") - exit(1) - shapes = pd.read_csv(args.input_file) - for i in range(len(shapes)): - ds = shapes.iloc[i] - gtuner.add_gemm(ds['M'],ds['N'],ds['K']) - else: - if not args.model_dir: - print(">>> Warning! NO MODEL SPECIFIED. 
Tuning for LL2 13B TP1") - #LL2 13B sizes - mksets = [(15360, 5120), (5120, 5120), (27648, 5120), (5120, 13824)] - gtuner.add_gemm(m=32000, n=1, k=5120) # logits gemm - else: - mksets, hidden_size = generate_mk_sets(args.model_dir, args.tp) - gtuner.add_gemm(m=32000//args.tp, n=1 * args.batch_size, k=hidden_size) #TODO: Handle cases where vocab_size is not divisible by tp - - for n in sorted(nsets): - for m, k in mksets: - gtuner.add_gemm(m, n, k) - - gtuner.find_best_sols() diff --git a/gradlib/gradlib/GemmTuner.py b/gradlib/gradlib/GemmTuner.py deleted file mode 100644 index 273042cb12a05..0000000000000 --- a/gradlib/gradlib/GemmTuner.py +++ /dev/null @@ -1,208 +0,0 @@ -import torch -import os -import argparse -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -import json -import random -from pathlib import Path -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -rtol = 1e-5 -atol = 1 -dtype = torch.float16 - -class Gemm: - def __init__(self,m,n,k,dtype,rocblas_decode=False): - self.m=m - self.k=k - self.n=n - self.dtype=dtype - self.nb = 37 - self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') - self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') - #weights2 is used in measurement/warm iters to ensure HBM fetch for weight tensors - self.weights2 = torch.randn((self.nb, self.m, self.k), dtype=self.dtype, device='cuda') - self.blob = torch.ones(128*1024*1024, dtype=torch.float32, device='cuda') - self.topn = 20 #number of top solutions from each source - self.hipb_sols=[] - self.rtol = 1e-5 - self.atol = 1 - self.start = torch.cuda.Event(enable_timing=True) - self.end = torch.cuda.Event(enable_timing=True) - self.hipb_prefer_ratio = 0.995 #prefer hipblaslt unless rocblas time is less than this ratio of hipblaslt time - self.rocblas_decode=rocblas_decode - - - def find_hipblas_sols(self): - sols = 
hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) - print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols), flush=True) - #print(sols) - self.hipb_sols = sols - - - def check_gemm_ref(self,libtype,solidx): - ref = F.linear(self.inp,self.weights) - if libtype == 'hipblaslt': - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - elif libtype == 'rocblas': - c = rocsolidxgemm.rocb_mm(self.inp,self.weights.t(),solidx) - if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): - #print('>>>',libtype,'Solidx',solidx,'passed reference test') - return True - else: - print('>>>',libtype,'Solidx',solidx,'FAILED reference test', flush=True) - print(ref, flush=True) - print(c, flush=True) - return False - def hipb_time_sol(self,solidx,cold_iters=2,warm_iters=10): - #print('>>>hipbtime',solidx) - for i in range(cold_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - self.start.record() - for i in range(warm_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights2 [random.randint(0,self.nb-1)].t(),solidx) - self.end.record() - torch.cuda.synchronize() - gtime = self.start.elapsed_time(self.end)/warm_iters - #print('>>> Solidx GTime',solidx,gtime,'ms') - return gtime - def hipb_time_all_sols(self,fast_mode=0,top_sols=0): - coldi=20; warmi=20 - if fast_mode: coldi=2; warmi=2 - solutions = self.hipb_sols - if top_sols: solutions = self.hipb_top_sols - gtimes = {} - for solidx in solutions: - gtimes[solidx] = self.hipb_time_sol(solidx, cold_iters=coldi, warm_iters=warmi) - self.hipb_gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') - self.hipb_gtimedf.to_csv('/tmp/hipb_gtimedf.csv') - print('>>> HipBlasLt top solutions, Fast Mode',fast_mode) - print(self.hipb_gtimedf.head(self.topn)) - def rocb_time_sol(self, solidx, cold_iters=2, warm_iters=10): - for i in range(cold_iters): - c = rocsolidxgemm.rocb_mm(self.inp, self.weights.t(), solidx) - self.start.record() - for i 
in range(warm_iters): - c = rocsolidxgemm.rocb_mm(self.inp, self.weights2[random.randint(0, self.nb-1)].t(), solidx) - self.end.record() - torch.cuda.synchronize() - gtime = self.start.elapsed_time(self.end)/warm_iters - #print('>>> RocSolidx GTime',solidx,gtime,'ms') - return gtime - def find_rocblas_sols(self): - sols = rocsolidxgemm.rocb_findallsols(self.inp,self.weights.t()) - print('M N K',self.m,self.n,self.k,'>>> Total rocb solutions',len(sols), flush=True) - #print(sols) - self.rocb_sols = sols - def rocb_time_all_sols(self,fast_mode=0,top_sols=0): - coldi=20; warmi=20 - if fast_mode: coldi=2; warmi=2 - solutions = self.rocb_sols - if top_sols: solutions = self.rocb_top_sols - gtimes = {} - for solidx in solutions: - gtimes[solidx] = self.rocb_time_sol(solidx,coldi,warmi) - self.rocb_gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') - self.rocb_gtimedf.to_csv('/tmp/rocb_gtimedf.csv') - print('>>> Rocblas top solutions, Fast Mode',fast_mode, flush=True) - print(self.rocb_gtimedf.head(self.topn), flush=True) - def warmup(self,warmi=500): - for i in range(warmi): - self.blob = self.blob + 0.00001 - def functional_check_topn_fastest(self): - rocb_topn = [] - for solidx in self.rocb_gtimedf.index[:self.topn]: - if self.check_gemm_ref(libtype='rocblas',solidx=solidx): - rocb_topn.append(solidx) - self.rocb_top_sols = rocb_topn - hipb_topn = [] - for solidx in self.hipb_gtimedf.index[:self.topn]: - if self.check_gemm_ref(libtype='hipblaslt',solidx=solidx): - hipb_topn.append(solidx) - self.hipb_top_sols = hipb_topn - - def find_fastest_solution(self): - self.find_rocblas_sols() - if not (self.rocblas_decode and self.n == 1): - self.find_hipblas_sols() - self.warmup() - self.rocb_time_all_sols(fast_mode=1) - self.warmup() - self.hipb_time_all_sols(fast_mode=1) - self.functional_check_topn_fastest() - self.warmup() - self.rocb_time_all_sols(fast_mode=0,top_sols=1) - self.warmup() - 
self.hipb_time_all_sols(fast_mode=0,top_sols=1) - if len(self.rocb_gtimedf)>0 and len(self.hipb_gtimedf)>0: - best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0] - best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0] - if best_rocb_time0: - print('>>> Only hipblas solutions found!',flush=True) - best_hipb_time = self.hipb_gtimedf.gtimems.iloc[0] - self.best_libtype = 'hipblaslt' - self.best_solidx = self.hipb_gtimedf.index[0] - self.best_soltime = best_hipb_time - elif len(self.rocb_gtimedf)>0: - print('>>> Only rocblas solutions found!',flush=True) - best_rocb_time = self.rocb_gtimedf.gtimems.iloc[0] - self.best_libtype = 'rocblas' - self.best_solidx = self.rocb_gtimedf.index[0] - self.best_soltime = best_rocb_time - else: - print('>>> No rocblas or hipblas solutions found!',flush=True) - self.best_libtype = 'rocblas' - self.best_solidx = 0 - self.best_soltime = 0 - print('>>> Fastest Solution is',self.best_libtype,self.best_solidx,self.best_soltime,flush=True) - - -class GemmTuner: - def __init__(self, dtype, tuned_file=None, rocblas_decode=False): - self.gemm_problems = pd.DataFrame(columns=['M','N','K']) - self.dtype = dtype - self.rocblas_decode = rocblas_decode - self.tuned_file = tuned_file - if Path(tuned_file).is_file(): - self.gdf = pd.read_csv(tuned_file) - else: - self.gdf = None - - def add_gemm(self,m,n,k): - if ( self.gdf is None or (self.gdf[(self.gdf['M'] == m) & (self.gdf['N'] == n) & (self.gdf['K'] == k)].empty)): - entry = {'M':[m], 'N':[n], 'K':[k]} - df = pd.DataFrame(entry) - self.gemm_problems = pd.concat([self.gemm_problems, df],ignore_index=True) - else: - print(f">>>Info: Found Duplicate shape(M:{m}, N:{n}, K:{k}), skipping") - - def find_best_sols(self): - df = self.gemm_problems - soldf = pd.DataFrame() - for i in range(len(df)): - ds = df.iloc[i] - gemmobj = Gemm(ds['M'],ds['N'],ds['K'],dtype=self.dtype, rocblas_decode=self.rocblas_decode) - gemmobj.find_fastest_solution() - soldf.loc[i,'libtype'] = gemmobj.best_libtype - 
soldf.loc[i,'solidx'] = gemmobj.best_solidx - soldf.loc[i,'soltimems'] = gemmobj.best_soltime - soldf['dtype'] = self.dtype - finaldf = pd.concat([self.gemm_problems, soldf],axis=1) - finaldf = pd.concat([finaldf, self.gdf]) - finaldf.to_csv(self.tuned_file, index=False) - print(finaldf) diff --git a/gradlib/mm_test.py b/gradlib/mm_test.py deleted file mode 100644 index 1b21b9ca105ff..0000000000000 --- a/gradlib/mm_test.py +++ /dev/null @@ -1,234 +0,0 @@ -import torch -#import gradlib -import rocsolidxgemm -import hipbsolidxgemm -import numpy as np -import torch.nn.functional as F -import sys -import pandas as pd -#gradlib.create_extension() -rocsolidxgemm.rocb_create_extension() -hipbsolidxgemm.hipb_create_extension() - -#m = 128; n = 192 ;k = 256 -#m = 7168; k = 4096*2; n = 256 -#m = int(1024*1.25); k = int(1024*8); n = 1 -#m = 1; k = int(1024*8); n = int(1024*7) -#m=22016; k=4096 ; n=1 -#m=int(27648/1);k=5120;n=8 -#m=5120;k=13824;n=1 -m=3*5120;k=5120;n=1 - - -rtol = 1e-5 -atol = 1 -dtype = torch.float16 - -class Gemm: - def __init__(self,m,n,k,dtype=torch.float16): - self.m=m - self.k=k - self.n=n - self.dtype=dtype - self.inp = torch.randn((self.n, self.k), dtype=self.dtype, device='cuda') - self.weights = torch.randn((self.m, self.k), dtype=self.dtype, device='cuda') - self.hipb_sols=[] - self.rtol = 1e-5 - self.atol = 1 - self.cold_iters = 2 - self.warm_iters = 10 - def find_hipblas_sols(self): - sols = hipbsolidxgemm.hipb_findallsols(self.inp,self.weights.t()) - print('M N K',self.m,self.n,self.k,'>>> Total hipb solutions',len(sols)) - #print(sols) - self.hipb_sols = sols - def hipb_check_gemm_ref(self,user_solidxs=None): - ref = F.linear(self.inp,self.weights) - if user_solidxs is not None: - solidxs = user_solidxs - else: - solidxs = self.hipb_sols - if len(solidxs)>0: - for solidx in solidxs: - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - if torch.allclose(c, ref, atol=self.atol, rtol=self.rtol): - print('>>> Hipb solidx',solidx,'passed 
reference test') - else: - print('>>> Hipb solidx',solidx,'FAILED reference test') - print(ref) - print(c) - def hipb_time_sol(self,solidx): - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - for i in range(self.cold_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - start.record() - for i in range(self.warm_iters): - c = hipbsolidxgemm.hipb_mm(self.inp,self.weights.t(),solidx) - end.record() - torch.cuda.synchronize() - gtime = start.elapsed_time(end)/self.warm_iters - #print('>>> Solidx GTime',solidx,gtime,'ms') - return gtime - def hipb_time_all_sols(self): - gtimes = {} - for solidx in self.hipb_sols: - gtimes[solidx] = self.hipb_time_sol(solidx) - self.gtimedf = pd.DataFrame.from_dict(gtimes,orient='index',columns=['gtimems']).sort_values(by='gtimems') - self.gtimedf.to_csv('/tmp/gtimedf.csv') - print(self.gtimedf.head(10)) - - - -gemmobj = Gemm(m=3*5120,n=1,k=5120) -gemmobj.find_hipblas_sols() -#gemmobj.hipb_check_gemm_ref() -#gemmobj.hipb_check_gemm_ref(user_solidxs=[131,8190]) -#gemmobj.hipb_time_sol(gemmobj.hipb_sols[0]) -gemmobj.hipb_time_all_sols() -gemmobj.hipb_check_gemm_ref(user_solidxs=gemmobj.gtimedf.head(5).index.values) - -sys.exit() -def splitk_linear(inp,w,splitk=2): - wsp = torch.chunk(w,splitk,dim=1) - isp = torch.chunk(inp,splitk,dim=1) - print('>>>',isp[0].shape,wsp[1].shape) - cnew = [] - for i in range(splitk): - cnew.append(F.linear(isp[i],wsp[i])) - #cnew1 = F.linear(isp[1],wsp[1]) - c = cnew[0] - for i in range(1,splitk): - c.add_(cnew[i]) - #c = torch.add(cnew0,cnew1) - - return c - -def splitm_linear(inp,w,splitm=2,splits=None,splitk=1): - outputp=[] - #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) - if splits is not None: - wsp = torch.split(w,splits) - else: - wsp = torch.chunk(w,splitm) - #cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) - #csp = torch.chunk(cout,splitm,dim=1) - - for i,_ in enumerate(wsp): - 
#print('>>>wspi',wsp[i].shape) - if splitk==1: - outputp.append(F.linear(inp, wsp[i])) - #cout[:,i*wsp[i].shape[0]:(i+1)*wsp[i].shape[0]] = F.linear(inp, wsp[i]) - #csp[i].copy_(F.linear(inp, wsp[i])) - else: - outputp.append(splitk_linear(inp,wsp[i],splitk)) - c = torch.cat((outputp),dim=1) - #print('>>>',c.shape,cout.shape) - return c - -def splitn_linear(inp,w,splitn=2,splits=None): - outputp=[] - if splits is not None: - isp = torch.split(inp,splits) - else: - isp = torch.chunk(inp,splitn) - cout = torch.empty(inp.shape[0],w.shape[0],dtype=inp.dtype,device=inp.device) - for i,_ in enumerate(isp): - outputp.append(F.linear(isp[i], w)) - #cout[i*isp[i].shape[0]:(i+1)*isp[i].shape[0],:] = F.linear(isp[i], w) - c = torch.cat((outputp),dim=0) - #print('>>>',c.shape,cout.shape) - return c - -nncount = 0 -for _ in range(10): - #a = torch.randn((m, k), dtype=dtype, device='cuda') - #b = torch.randn((k, n), dtype=dtype, device='cuda') - inp = torch.randn((n, k), dtype=dtype, device='cuda') - weights = torch.randn((m, k), dtype=dtype, device='cuda') - #c = gradlib.mm(inp, weights.t()) - c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) - c = hipbsolidxgemm.hipb_mm(inp,weights.t(),20053) - c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) - c = rocsolidxgemm.rocb_mm(inp,weights.t(),60995) - - splitm=2 - #padm=2 - outsp=[] - #wsp = torch.chunk(F.pad(weights,(0,0,0,padm)),splitm) - #wsp = torch.chunk(weights,splitm) - #wsp = torch.split(weights,(3*1024,4*1024)) - #c = torch.empty((n,m),dtype=dtype,device='cuda') - #outtup = [] - #for i,_ in enumerate(wsp): - # print('>>>wspi',wsp[i].shape) - # outsp.append(F.linear(inp, wsp[i])) - # #outtup.append(splitk_linear(inp, wsp[i])) - #outsp = [torch.add(a,b) for a,b in outtup] - #c = torch.cat((outsp),dim=1) - #c = c[:,:-padm] - #c = splitm_linear(inp,weights,splitm=4,splits=None,splitk=1) - #c = splitn_linear(inp,weights,splitn=2,splits=None) - - #wsp = torch.chunk(weights,2,dim=1) - #isp = torch.chunk(inp,2,dim=1) - 
#print('>>>',isp[0].shape,wsp[1].shape) - #cnew0 = F.linear(isp[0],wsp[0]) - #cnew1 = F.linear(isp[1],wsp[1]) - #c = torch.add(cnew0,cnew1) - #c = splitk_linear(inp, weights, splitk=4) - - #torch.cuda.synchronize() - ref = F.linear(inp,weights) - #ref = torch.matmul(a,b) - if torch.allclose(c, ref, atol=atol, rtol=rtol): - nncount += 1 - else: - print(ref) - print(c) -''' -tncount = 0 -for _ in range(10): - a = torch.randn((m, k), dtype=dtype, device='cuda') - b = torch.randn((n, k), dtype=dtype, device='cuda') - c = gradlib.mm(a, b.t()) - #torch.cuda.synchronize() - ref = torch.matmul(a, b.t()) - if torch.allclose(c, ref, atol=atol, rtol=rtol): - tncount += 1 - else: - print(ref) - print(c) - #torch.save(c-ref, '/tmp/difference.pt') - #np.savetxt('my_file.txt', (c-ref).cpu().numpy()) - dfs = ref - c - nz = torch.nonzero(dfs,as_tuple=True) - print(nz) - print(dfs[nz]) - print(ref[nz]) - print(c[nz]) -''' -''' -ntcount = 0 -for _ in range(10): - a = torch.randn((k, m), dtype=dtype, device='cuda') - b = torch.randn((k, n), dtype=dtype, device='cuda') - c = gradlib.mm(a.t(), b) - #torch.cuda.synchronize() - if torch.allclose(c, torch.matmul(a.t(), b), atol=atol, rtol=rtol): - ntcount += 1 - -ttcount = 0 -for _ in range(10): - a = torch.randn((k, m), dtype=dtype, device='cuda') - b = torch.randn((n, k), dtype=dtype, device='cuda') - c = gradlib.mm(a.t(), b.t()) - torch.cuda.synchronize() - if torch.allclose(c, torch.matmul(a.t(), b.t()), atol=atol, rtol=rtol): - ttcount += 1 -''' -print(f"GEMM (m, n, k) = {n}, {m}, {k}") -print(f"NN GEMMs: pass {nncount}/10, tol={rtol}") -#print(f"TN GEMMs: pass {tncount}/10, tol={rtol}") -#print(f"NT GEMMs: pass {ntcount}/10, tol={rtol}") -#print(f"TT GEMMs: pass {ttcount}/10, tol={rtol}") diff --git a/gradlib/setup.py b/gradlib/setup.py deleted file mode 100644 index 1ca83dbe79f6c..0000000000000 --- a/gradlib/setup.py +++ /dev/null @@ -1,136 +0,0 @@ -import torch -import setuptools -from setuptools import setup -from 
torch.utils.cpp_extension import BuildExtension, CUDAExtension -from torch.utils.hipify import hipify_python -import os -import subprocess -import re - -this_dir = os.path.dirname(os.path.abspath(__file__)) -#gpus = subprocess.check_output("/opt/rocm/bin/rocminfo").decode('UTF-8').split('\n') -#gpus = list(set([re.search('(gfx94.)', g).group(0) for g in gpus if 'gfx94' in g])) -gpus = ['gfx90a','gfx940','gfx941','gfx942'] -#gpus = ['gfx90a','gfx940'] -extra_args = ["--offload-arch=" + g for g in gpus] - - -#sets_rocm_pytorch = False -maj_ver, min_ver, *_ = torch.__version__.split('.') -if int(maj_ver) > 1 or (int(maj_ver) == 1 and int(min_ver) >= 5): - from torch.utils.cpp_extension import ROCM_HOME - is_rocm_pytorch = True if ((torch.version.hip is not None) and (ROCM_HOME is not None)) else False - -ext_modules = [] - -generator_flag = [] -torch_dir = torch.__path__[0] -if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')): - generator_flag = ['-DOLD_GENERATOR'] - -print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__)) -TORCH_MAJOR = int(torch.__version__.split('.')[0]) -TORCH_MINOR = int(torch.__version__.split('.')[1]) - -version_ge_1_1 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): - version_ge_1_1 = ['-DVERSION_GE_1_1'] -version_ge_1_3 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): - version_ge_1_3 = ['-DVERSION_GE_1_3'] -version_ge_1_5 = [] -if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): - version_ge_1_5 = ['-DVERSION_GE_1_5'] -version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - -include_dirs=[os.path.join(this_dir, 'csrc')] - -#if is_rocm_pytorch: -# import shutil -# with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: -# hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", -# show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) - -if not 
is_rocm_pytorch: - ext_modules.append( - CUDAExtension( - name='gradlib', - sources=['grad_funcs.cu'], - extra_compile_args={ - 'cxx': ['-O3',], - 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', "--expt-relaxed-constexpr", "-ftemplate-depth=1024", '-gencode=arch=compute_70,code=sm_70','-gencode=arch=compute_80,code=sm_80','-gencode=arch=compute_80,code=compute_80'] - } - ) - ) -elif is_rocm_pytorch: - #if torch.__version__ <= '1.8': - hipify_ver = [int(x) for x in torch.utils.hipify.__version__.split(".")] if hasattr(torch.utils.hipify, "__version__") else [0,0,0] - if hipify_ver < [1,0,0]: - import shutil - with hipify_python.GeneratedFileCleaner(keep_intermediates=True) as clean_ctx: - hipify_python.hipify(project_directory=this_dir, output_directory=this_dir, includes="csrc/*", - show_detailed=True, is_pytorch_extension=True, clean_ctx=clean_ctx) - - ext_modules.append( - CUDAExtension( - name='gradlib', - sources=['./csrc/hip/grad_funcs.hip'], - extra_compile_args={ - 'cxx': ['-O3',] + version_dependent_macros, - 'nvcc':['-O3'] + extra_args - } - ) - ) - else: - #ext_modules.append( - # CUDAExtension( - # name='gradlib', - # sources=['./csrc/grad_funcs.cu'], - # include_dirs=include_dirs, - # # add additional libraries argument for hipblaslt - # libraries=['hipblaslt'], - # extra_compile_args={ - # 'cxx': ['-O3',], - # 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', - # "-ftemplate-depth=1024"] + extra_args - # } - # ) - # ) - ext_modules.append( - CUDAExtension( - name='rocsolidxgemm', - sources=['./csrc/rocsolgemm.cu'], - include_dirs=include_dirs, - # add additional libraries argument for hipblaslt - libraries=['rocblas'], - extra_compile_args={ - 'cxx': ['-O3',], - 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', - "-ftemplate-depth=1024"] + extra_args - } - ) - ) - ext_modules.append( - CUDAExtension( - name='hipbsolidxgemm', - sources=['./csrc/hipbsolgemm.cu'], 
- include_dirs=include_dirs, - # add additional libraries argument for hipblaslt - libraries=['hipblaslt'], - extra_compile_args={ - 'cxx': ['-O3',], - 'nvcc':['-O3','-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', - "-ftemplate-depth=1024"] + extra_args - } - ) - ) - -setup( - name='gradlib', - packages=['gradlib'], - ext_modules=ext_modules, - cmdclass={ - 'build_ext': BuildExtension -}) - -# python setup.py build && cp build/lib*/gradlib* ../ diff --git a/run.sh b/run.sh deleted file mode 100755 index 7b9336a0a076a..0000000000000 --- a/run.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -BASE_DIR=/trees/ -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$BASE_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -#MODEL=/data/Llama-2-13B-Chat-fp16 -#MODEL=/data/llama-2-13b-chat-hf -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` - -GEN_LEN="8" -TP=8 -INPUT_LEN=2048 -ITER=1 -cd $VLLM_DIR - - echo "tuned_gemm_csv: ./tuned_tp$TP.csv" > $VLLM_DIR/tuned_perf_tp$TP.yaml - tuned_file=$VLLM_DIR/tuned_tp$TP.csv -export VLLM_PERF_YAML=./tuned_perf_tp$TP.yaml - -for tp in $TP; -do - for gen_len in $GEN_LEN; - do - for input_len in $INPUT_LEN; - do - -python benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER - done -done -done diff --git a/run_70b.sh b/run_70b.sh deleted file mode 100755 index ed004b56c17d3..0000000000000 --- a/run_70b.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -BASE_DIR=/trees -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$BASE_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` -#MODEL=/data/llama-2-13b-chat-hf -GEMM_TUNER=1 -#TP="1 2 4 8" -TP=8 -#Flag to use Triton Flash Attention vs CK -#export VLLM_USE_TRITON=1 - -#Gemm tuner flags -export VLLM_TUNE_GEMM=0 -export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" -export 
VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" - -#Flag to use old torch.multinomial -#export VLLM_USE_TORCH_MULTINOMIAL=1 - -#Delete tuned gemms before running. -#DELETE_TUNED_CSV=1 -#Flag to disable MSCCL -#export RCCL_MSCCL_ENABLE=0 -#HIPGraph performance flags -export HIP_FORCE_DEV_KERNARG=1 -export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 -#Enable full decoder graph mode -#Use top of tree build of RCCL -export LD_LIBRARY_PATH=/workspace/rccl/build/ -#Enable either flag to create a profile trace (rocprof, or rocpd) -#RPD_PROFILE="--profile" -#ROCPROF_PROFILE="rocprof --hip-trace" -GEN_LEN="1,32,128" -INPUT_LEN="512,1024,2048,3072" - -ITER=10 -# pring usage of the parameters -usage() { - echo "Usage: $0 [--tp ] [--model ]" - exit 1 -} -# parse parameters -while [[ "$#" -gt 0 ]]; do - case $1 in - --tp) TP="$2"; shift ;; - --model) MODEL="$2"; shift ;; - --notune) GEMM_TUNER=0; shift ;; - *) usage ;; # Any other argument will show usage information. - esac - shift # Move to next argument -done -for tp in $TP; -do - if (( $GEMM_TUNER )); - then - echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml - - if [[ $DELETE_TUNED_CSV == 1 ]]; - then - rm -rf $VLLM_TUNE_FILE - fi - #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml - echo "INFO: Generating Tuned Gemm configs" - cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp - fi - - cd $VLLM_DIR - for gen_len in $GEN_LEN; - do - for input_len in $INPUT_LEN; - do - if [[ -v RPD_PROFILE ]] ; - then - rm /workspace/trace.rpd - python -m rocpd.schema --create /workspace/trace.rpd - fi - echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report - if [[ -v ROCPROF_PROFILE ]] 
; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - mv $VLLM_DIR/results.json $TRACE_FILE - fi - if [[ -v RPD_PROFILE ]] ; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE - fi - done - done -done diff --git a/run_70b_fast.sh b/run_70b_fast.sh deleted file mode 100755 index 0ed20e59ca3ff..0000000000000 --- a/run_70b_fast.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -set -e -BASE_DIR=/trees -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$BASE_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` - -export VLLM_TUNE_GEMM=0 -export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" -export VLLM_TUNE_FILE=$VLLM_DIR/"tuned.csv" - -#Flag to use Triton Flash Attention vs CK -export VLLM_USE_TRITON=1 - -#Flag to use old torch.multinomial -#export VLLM_USE_TORCH_MULTINOMIAL=1 - -#Delete tuned gemms before running. 
-#DELETE_TUNED_CSV=1 - -#Flag to disable MSCCL -#export RCCL_MSCCL_ENABLE=0 - -#HIPGraph performance flags -export HIP_FORCE_DEV_KERNARG=1 -export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 - - -#Use top of tree build of RCCL -export LD_LIBRARY_PATH=/workspace/rccl/build/ - -#Enable either flag to create a profile trace (rocprof, or rocpd) -#RPD_PROFILE="--profile" -#ROCPROF_PROFILE="rocprof --hip-trace" - -#TP="1 2 4 8" -TP=8 -GEN_LEN="32" -INPUT_LEN="512,1024,2048,3072" -#INPUT_LEN="512,1024,2048,3072,4096,6144,8192,16384" -BATCH_SIZE="1" -ITER=10 - -rm -f $VLLM_UNTUNE_FILE -for tp in $TP; -do - cd $VLLM_DIR - export VLLM_TUNE_GEMM=1 - echo "================================= WARMING UP $MODEL ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ - --tensor-parallel-size $tp --num-iters 1 --warmup-only - - if [ -f $VLLM_UNTUNE_FILE ]; then - echo "=============================== Tuning ======================================" - python $GRAD_DIR/gemm_tuner.py --tuned_file $VLLM_TUNE_FILE --input_file $VLLM_UNTUNE_FILE - echo "File does not exist." 
- fi - - export VLLM_TUNE_GEMM=0 - echo "================================= RUNNING $MODEL ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size $BATCH_SIZE --input-len $INPUT_LEN --output-len $GEN_LEN \ - --tensor-parallel-size $tp --num-iters $ITER --report --report-file=$VLLM_DIR/report.csv -done \ No newline at end of file diff --git a/run_llama2.sh b/run_llama2.sh deleted file mode 100755 index 1444ca7d222a1..0000000000000 --- a/run_llama2.sh +++ /dev/null @@ -1,98 +0,0 @@ -#!/bin/bash -BASE_DIR=/workspace -VLLM_DIR=$BASE_DIR/vllm -GRAD_DIR=$VLLM_DIR/gradlib -RPD_DIR=/workspace/rocmProfileData -MODEL=/data/llama2-70b-chat -MODEL_SIZE=`echo $MODEL | sed 's/.*\(.[0-9][bB]\).*/\1/'` -#MODEL=/data/llama-2-13b-chat-hf -GEMM_TUNER=1 -#TP="1 2 4 8" -TP=8 -#Flag to use Triton Flash Attention vs CK -#export VLLM_USE_TRITON=1 - -#Gemm tuner flags -export VLLM_TUNE_GEMM=0 -export VLLM_UNTUNE_FILE="/tmp/vllm_untuned.csv" -export VLLM_TUNE_FILE=$VLLM_DIR"/tuned.csv" - -#Flag to use old torch.multinomial -#export VLLM_USE_TORCH_MULTINOMIAL=1 - -#Delete tuned gemms before running. 
-#DELETE_TUNED_CSV=1 -#Flag to disable MSCCL -#export RCCL_MSCCL_ENABLE=0 -#HIPGraph performance flags -export HIP_FORCE_DEV_KERNARG=1 -export DEBUG_CLR_GRAPH_PACKET_CAPTURE=1 -#Enable full decoder graph mode -#Use top of tree build of RCCL -export LD_LIBRARY_PATH=/workspace/rccl/build/ -#Enable either flag to create a profile trace (rocprof, or rocpd) -#RPD_PROFILE="--rpd" -#ROCPROF_PROFILE="rocprof --hip-trace" -GEN_LEN="1,32,128" -INPUT_LEN="512,1024,2048,3072" - -ITER=10 -# pring usage of the parameters -usage() { - echo "Usage: $0 [--tp ] [--model ]" - exit 1 -} -# parse parameters -while [[ "$#" -gt 0 ]]; do - case $1 in - --tp) TP="$2"; shift ;; - --model) MODEL="$2"; shift ;; - --notune) GEMM_TUNER=0; shift ;; - *) usage ;; # Any other argument will show usage information. - esac - shift # Move to next argument -done -for tp in $TP; -do - if (( $GEMM_TUNER )); - then - echo "tuned_gemm_csv: "$VLLM_TUNE_FILE > $VLLM_DIR/tuned_perf_tp$tp.yaml - - if [[ $DELETE_TUNED_CSV == 1 ]]; - then - rm -rf $VLLM_TUNE_FILE - fi - #export VLLM_PERF_YAML=./tuned_perf_tp$tp.yaml - echo "INFO: Generating Tuned Gemm configs" - cd $GRAD_DIR - python gemm_tuner.py --model_dir $MODEL --tuned_file $VLLM_TUNE_FILE --tp $tp - fi - - cd $VLLM_DIR - for gen_len in $GEN_LEN; - do - for input_len in $INPUT_LEN; - do - if [[ -v RPD_PROFILE ]] ; - then - rm /workspace/trace.rpd - python -m rocpd.schema --create /workspace/trace.rpd - fi - echo "================================= RUNNING $MODEL $input_len $gen_len ===============================================" - $ROCPROF_PROFILE torchrun --standalone --nnodes=1 --nproc-per-node=$tp benchmarks/benchmark_latency.py --model $MODEL --batch-size 1 --input-len $input_len --output-len $gen_len \ - --tensor-parallel-size $tp --num-iters $ITER $RPD_PROFILE --report - if [[ -v ROCPROF_PROFILE ]] ; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - mv 
$VLLM_DIR/results.json $TRACE_FILE - fi - if [[ -v RPD_PROFILE ]] ; - then - TRACE_FILE=$BASE_DIR/trace_${MODEL_SIZE}_${input_len}_${gen_len}.json - echo "INFO: Creating Trace JSON file $TRACE_FILE" - python $RPD_DIR/tools/rpd2tracing.py --format object $BASE_DIR/trace.rpd $TRACE_FILE - fi - done - done -done diff --git a/vllm/config.py b/vllm/config.py index 787b7feb737cd..de687395a0001 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -436,7 +436,7 @@ def __init__( self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. if self.world_size > 1 and not is_neuron(): - self.worker_use_ray = False + self.worker_use_ray = True self._verify_args() def _verify_args(self) -> None: diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 123b02c8d6cc8..742f3dc575190 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,6 +1,5 @@ import pickle -import socket from typing import Optional, List, Tuple from vllm.config import ParallelConfig @@ -67,12 +66,7 @@ def execute_model_compiled_dag_remote(self, ignored): RayWorkerVllm = None -def get_open_port(): - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] - -def initialize_cluster( +def initialize_ray_cluster( parallel_config: ParallelConfig, ray_address: Optional[str] = None, ): @@ -100,18 +94,10 @@ def initialize_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True) - # if not parallel_config.worker_use_ray: - # assert parallel_config.world_size == 1, ( - # "Ray is required if parallel_config.world_size > 1.") - # return None - if not parallel_config.worker_use_ray: - # Initialize cluster locally. - port = get_open_port() - # We need to setup the distributed init method to make sure - # the distributed megatron code (e.g., get world size) works correctly. 
- distributed_init_method = f"tcp://localhost:{port}" - return distributed_init_method, None - + if parallel_config.placement_group: + # Placement group is already set. + return + # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 6e2dbefa44a49..40e681df48f86 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,8 +13,6 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger -from vllm.model_executor.layers.tuned_gemm import tgemm - logger = init_logger(__name__) @@ -78,9 +76,7 @@ def apply_weights(self, if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) - #tgemm.mm(x,weight) - #return F.linear(x, weight, bias) - return tgemm.mm(x,weight) + return F.linear(x, weight, bias) class ReplicatedLinear(torch.nn.Module): @@ -132,7 +128,6 @@ def __init__( def forward(self, x: torch.Tensor) -> torch.Tensor: bias = self.bias if not self.skip_bias_add else None output = self.linear_method.apply_weights(self.linear_weights, x, bias) - #print(f">>> output is {output}") output_bias = self.bias if self.skip_bias_add else None return output, output_bias @@ -580,7 +575,7 @@ def forward(self, input_): output_ = tensor_model_parallel_all_reduce(output_parallel) else: output_ = output_parallel - #print(f">>> ROWPARALLEL {output_.shape}") + if not self.skip_bias_add: output = output_ + self.bias if self.bias is not None else output_ output_bias = None diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index a7adacea7716d..4377b845df628 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -43,8 +43,7 @@ def __init__(self, def _get_logits(self, hidden_states: torch.Tensor, 
embedding: torch.Tensor, embedding_bias: Optional[torch.Tensor]) -> torch.Tensor: # Get the logits for the next tokens. - #logits = torch.matmul(hidden_states, embedding.t()) - logits = tgemm.mm(hidden_states, embedding) + logits = torch.matmul(hidden_states, embedding.t()) if embedding_bias is not None: logits += embedding_bias logits = tensor_model_parallel_gather(logits) diff --git a/vllm/model_executor/layers/tuned_gemm.py b/vllm/model_executor/layers/tuned_gemm.py deleted file mode 100644 index bebab27ebfd86..0000000000000 --- a/vllm/model_executor/layers/tuned_gemm.py +++ /dev/null @@ -1,111 +0,0 @@ -import torch -import torch.nn.functional as F -from rocsolidxgemm import rocb_create_extension,rocb_mm -from hipbsolidxgemm import hipb_create_extension,hipb_mm -from pathlib import Path -import os -import yaml -import pandas as pd -from vllm import custom_ops - - -class TunedGemm: - def __init__(self): - #rocb_create_extension() - #hipb_create_extension() - self.extensions_created = False - self.save_gemm = int(os.environ.get('VLLM_TUNE_GEMM',0)) - self.untune_path = os.environ.get('VLLM_UNTUNE_FILE', "/tmp/vllm_untuned.csv") - self.tune_path = os.environ.get('VLLM_TUNE_FILE', "tuned.csv") - self.bestsols = {} - self.load_best_sols() - self.create_ds() - - - if (self.save_gemm == 1): - self.tuned_df = pd.DataFrame(columns=['M','N','K']) - else: - self.tuned_df = None - - def load_best_sols(self): - if self.tune_path is not None and Path(self.tune_path).is_file(): - self.bestsols = pd.read_csv(self.tune_path) - - def apply_custom(self,ds): - M,N,K = ds['M'],ds['N'],ds['K'] - #apply custom matvec (only for f16 dtype) - if N==1: - ds1 = ds.copy() - ds1['libtype'] = 'custom' - if K==8192 and (M==1280 or M==7168): - ds1['solidx'] = 8 - return ds1 - elif K==3584 and M==8192: - ds1['solidx'] = 8 - return ds1 - elif K<=8192 and K%8==0 and M%4==0: - ds1['solidx'] = 1 - return ds1 - return ds - def create_ds(self): - df = self.bestsols - solds = {} - for i in 
range(len(df)): - ds = self.apply_custom(df.iloc[i]) - key = (ds['M'],ds['N'],ds['K']) - if ds['libtype']=='hipblaslt': soltype = 1 - elif ds['libtype']=='rocblas': soltype = 2 - elif ds['libtype']=='custom': soltype = 3 - solds[key] = (soltype,int(ds['solidx'])) - self.solids = solds - #print('>>>',solds) - def query_sol(self,m,n,k): - return self.solids.get((m,n,k),(0,0)) - def mm(self,inp,weights): - # F.Linear can take a 3 dimensional input. vllm uses this for linear units. - # However, sampler will use torch.matmul with 2 dimensions only - if inp.dim() == 3: - inp_view=inp.view(-1,inp.size(-1)) - batched = True - else: - inp_view = inp - batched = False - #print(f'>>>inp_view {inp_view.shape}') - if self.extensions_created == False: - rocb_create_extension() - hipb_create_extension() - self.extensions_created = True - soltype,solidx = self.query_sol(m=weights.shape[0],n=inp_view.shape[0],k=inp_view.shape[1]) - if soltype==1: - #print(">>> found hipblas") - out = hipb_mm(inp_view,weights.t(),solidx) - elif soltype==3: - ##only matvec is supported currently - out = torch.empty(inp.shape[0],weights.shape[0],dtype=torch.float16,device='cuda') - #print('>>>Matvec',inp.shape,weights.shape,soltype,solidx) - if solidx<=1: - custom_ops.LLMM1(weights,inp,out,4) - elif solidx==2: - custom_ops.LLMM1(weights,inp,out,2) - elif solidx==8: - custom_ops.LLMM1(weights,inp,out,8) - elif solidx==20: - custom_ops.LLZZ(weights,inp,out,0) - elif solidx==21: - custom_ops.LLZZ(weights,inp,out,1) - elif soltype==2: - #print(">>> found rocblas") - out = rocb_mm(inp_view,weights.t(),solidx) - else: - - if (self.save_gemm == 1): - print('>>>Tgemm Default',inp_view.shape, inp.shape,weights.shape,soltype,solidx) - self.tuned_df = pd.concat([self.tuned_df, pd.DataFrame({'M':[weights.shape[0]], 'N':[inp.shape[0]*inp.shape[1]], 'K':[weights.shape[1]]})]).drop_duplicates() - self.tuned_df.to_csv(self.untune_path, index=False) - out = F.linear(inp,weights) - if batched: - return 
out.view(inp.shape[0], inp.shape[1], weights.shape[0]) - else: - return out - -tgemm = TunedGemm() diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 58bf6c2d97e24..521b6b8a383b0 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -79,7 +79,6 @@ def tensor_model_parallel_gather(input_: torch.Tensor, all the ranks. """ world_size = get_tensor_model_parallel_world_size() - # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ @@ -89,22 +88,19 @@ def tensor_model_parallel_gather(input_: torch.Tensor, # Convert negative dim to positive. dim += input_.dim() # Allocate output tensor. - gather_list = [torch.empty_like(input_) for _ in range(world_size)] - # if get_tensor_model_parallel_rank() == dst: - # gather_list = [torch.empty_like(input_) for _ in range(world_size)] - # else: - # gather_list = None + if get_tensor_model_parallel_rank() == dst: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + else: + gather_list = None # Gather. 
- - #print(f'>>> world size {world_size}, {gather_list}, {dst} {get_tensor_model_parallel_group()}') - torch.distributed.all_gather(gather_list, input_, + torch.distributed.gather(input_, + gather_list, + dst=dst, group=get_tensor_model_parallel_group()) - output_tensor = torch.cat(gather_list, dim=dim) - # if get_tensor_model_parallel_rank() == dst: - # output_tensor = torch.cat(gather_list, dim=dim) - # else: - # output_tensor = None - #print(f'>>> output_tensor {output_tensor}, {dst}, {dim}') + if get_tensor_model_parallel_rank() == dst: + output_tensor = torch.cat(gather_list, dim=dim) + else: + output_tensor = None return output_tensor diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 593d5aadcd2ab..7eac576e3f0fe 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -31,7 +31,6 @@ # Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256. # NOTE: _get_graph_batch_size needs to be updated if this list is changed. _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)] -#_BATCH_SIZES_TO_CAPTURE = [1] class ModelRunner: @@ -537,7 +536,7 @@ def prepare_input_tensors( "lora_requests": lora_requests, "lora_mapping": lora_mapping, } - #broadcast_tensor_dict(metadata_dict, src=0) + broadcast_tensor_dict(metadata_dict, src=0) else: metadata_dict = broadcast_tensor_dict(src=0) input_tokens = metadata_dict["input_tokens"] diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index e201b484aa070..0dcd4018afa5f 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -51,9 +51,6 @@ def __init__( self.distributed_init_method = distributed_init_method self.lora_config = lora_config self.is_driver_worker = is_driver_worker - local_rank = int(os.getenv("LOCAL_RANK", "0")) - self.local_rank = local_rank - if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." 
@@ -83,12 +80,7 @@ def init_model(self, cupy_port: Optional[int] = None) -> None: # This env var set by Ray causes exceptions with graph building. os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.rank = self.rank if self.rank is not None else int( - os.getenv("RANK", "-1")) - self.device = torch.device(f"cuda:{self.local_rank}") - - torch.cuda.set_device(self.device) _check_if_gpu_supports_dtype(self.model_config.dtype) @@ -201,7 +193,7 @@ def execute_model( blocks_to_swap_out: Optional[Dict[int, int]] = None, blocks_to_copy: Optional[Dict[int, List[int]]] = None, ) -> Optional[SamplerOutput]: - if self.is_driver_worker and self.rank == 0: + if self.is_driver_worker: assert seq_group_metadata_list is not None num_seq_groups = len(seq_group_metadata_list) assert blocks_to_swap_in is not None @@ -213,7 +205,7 @@ def execute_model( "blocks_to_swap_out": blocks_to_swap_out, "blocks_to_copy": blocks_to_copy, } - #broadcast_tensor_dict(data, src=0) + broadcast_tensor_dict(data, src=0) else: data = broadcast_tensor_dict(src=0) num_seq_groups = data["num_seq_groups"] @@ -281,25 +273,6 @@ def init_distributed_environment( world_size=parallel_config.world_size, rank=rank, init_method=distributed_init_method, - #init_method="env://", - ) - - if cupy_utils.is_initialized(): - cupy_world_size = cupy_utils.get_world_size() - if cupy_world_size != parallel_config.world_size: - raise RuntimeError( - "cupy.distributed is already initialized but the cupy world " - "size does not match parallel_config.world_size " - f"({cupy_world_size} vs. {parallel_config.world_size}).") - elif (parallel_config.world_size > 1 and cupy_port is not None): - # NOTE(woosuk): We don't initialize CuPy process group when world size - # is 1. - # TODO(woosuk): Support multi-node connection. 
- cupy_utils.init_process_group( - world_size=parallel_config.world_size, - rank=rank, - host="localhost", - port=cupy_port, ) if cupy_utils.is_initialized(): From 1510843e0036b57b046da1ca373e4f128eb4edf7 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Mon, 18 Mar 2024 21:05:32 +0000 Subject: [PATCH 144/159] Initializing scaling factors for kv cache in flash attention backend --- .../layers/attention/attention.py | 31 ++++++++++++++++--- .../layers/attention/backends/flash_attn.py | 12 ++++++- vllm/model_executor/models/llama.py | 6 ++-- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 99ee67b288f90..db3bfceb66160 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -34,11 +34,9 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if (torch.cuda.get_device_capability()[0] >= 8 and - torch.get_default_dtype() in (torch.float16, torch.bfloat16)): - # Ampere or later NVIDIA GPUs. - # NOTE(woosuk): FlashAttention does not support FP32. - from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend + if _use_flash_attn(): + + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) @@ -59,3 +57,26 @@ def forward( ) -> torch.Tensor: return self.backend.forward(query, key, value, key_cache, value_cache, input_metadata) + + +@lru_cache(maxsize=1) +def _use_flash_attn() -> bool: + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return False + + if torch.cuda.get_device_capability()[0] < 8: + # Volta and Turing NVIDIA GPUs. 
+ logger.info("flash_attn is not supported on Turing or older GPUs. " + "Using xformers backend.") + return False + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): + logger.info( + "flash_attn only supports torch.float16 or torch.bfloat16. " + "Using xformers backend.") + return False + + logger.info("Using flash_attn backend.") + return True diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index f8dc1ca460ca6..a5db2a6797439 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -38,6 +38,14 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes + # This will be set to a float by model initialization per attention, + # if and only if we are using it. N.B. currently we only support per + # tensor scalar scaling factors & only applicable to ROCm (AMD GPU). + # The scaling factor convention we are assuming is + # quantized_value * scaling_factor ~= true_value + # which is consistent with the practice of setting + # scaling_factor = tensor_amax / FPtype_max + self.kv_cache_scaling_factor = 1.0 assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -85,7 +93,8 @@ def forward( # profiling run. if key_cache is not None and value_cache is not None: PagedAttentionImpl.reshape_and_cache(key, value, key_cache, - value_cache, input_metadata) + value_cache, input_metadata, + self.kv_cache_scaling_factor) if input_metadata.is_prompt: # Prompt run. @@ -136,6 +145,7 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, + self.kv_cache_scaling_factor, ) # Reshape the output tensor. 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 07e75f7dc5cb4..6eb645ca675d9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -411,7 +411,7 @@ def load_kv_cache_scales(self, scales_path: str) -> None: for layer_idx, scaling_factor in kv_cache_scales_loader( scales_path, tp_rank, tp_size, self.config.num_hidden_layers, self.config.__class__.model_type): - layer_paged_attn = ( + layer_attn = ( self.model.layers[layer_idx].self_attn.attn.backend) if is_hip(): @@ -420,8 +420,8 @@ def load_kv_cache_scales(self, scales_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_paged_attn, "kv_cache_scaling_factor"): - layer_paged_attn.kv_cache_scaling_factor = scaling_factor + if hasattr(layer_attn, "kv_cache_scaling_factor"): + layer_attn.kv_cache_scaling_factor = scaling_factor else: raise RuntimeError("PagedAttention has no KV cache scaling " "factor attribute!") From 3d82eea53adcc09c2e1ae546fb3747742ca86d91 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Tue, 19 Mar 2024 14:24:49 +0000 Subject: [PATCH 145/159] Removed obsolete parts. 
Made rocm attention defaults define guarded --- Dockerfile.rocm | 62 +++-- benchmarks/benchmark_latency.py | 168 +++++--------- csrc/attention/attention_kernels.cu | 9 + csrc/cache.h | 7 - csrc/cache_kernels.cu | 161 ------------- csrc/pybind.cpp | 4 - vllm/model_executor/models/internlm.py | 299 ------------------------- 7 files changed, 92 insertions(+), 618 deletions(-) delete mode 100644 vllm/model_executor/models/internlm.py diff --git a/Dockerfile.rocm b/Dockerfile.rocm index ec669ca89c9b8..a143f37ab4f2f 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,9 +1,14 @@ -FROM rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1 -ENV WORKSPACE_DIR=/workspace -RUN mkdir -p $WORKSPACE_DIR -WORKDIR $WORKSPACE_DIR -# Limit arch's so composable kernel doesn't take days to finish -ENV PYTORCH_ROCM_ARCH=gfx90a;gfx942 +# default base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" + +FROM $BASE_IMAGE + +RUN echo "Base image is $BASE_IMAGE" + +# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" +# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" + + ARG FA_GFX_ARCHS="gfx90a;gfx942" RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" @@ -22,22 +27,9 @@ ARG BUILD_CUPY="1" # whether to build triton on rocm ARG BUILD_TRITON="1" -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y - # Install some basic utilities RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - build-essential \ - wget \ - unzip \ - nvidia-cuda-toolkit \ - tmux \ + sqlite3 libsqlite3-dev libfmt-dev \ && rm -rf /var/lib/apt/lists/* ### Mount Point ### @@ -60,6 +52,8 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && cd libs \ && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ && export GPU_ARCHS=${FA_GFX_ARCHS} \ && if [ "$BASE_IMAGE" 
= "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ @@ -94,10 +88,11 @@ RUN if [ "$BUILD_TRITON" = "1"]; then \ mkdir -p libs \ && cd libs \ && pip uninstall -y triton \ - && git clone https://github.com/ROCmSoftwarePlatform/triton.git + && git clone https://github.com/ROCm/triton.git \ && cd triton/python \ && pip3 install -e . \ - && cd ../..; \ + && cd ../.. \ + && rm -r triton; \ fi COPY ./ /app/vllm @@ -105,16 +100,17 @@ COPY ./ /app/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install xformers==0.0.23 --no-deps -RUN cd vllm \ - && pip install -r requirements-rocm.txt \ - && pip install typing-extensions==4.8.0 \ - && bash patch_xformers.rocm.sh \ - && cd gradlib && python setup.py develop && cd ../ \ - && python setup.py build && python setup.py develop; exit 0 - -RUN pip install pyarrow Ray pandas==2.0 numpy==1.20.3 +RUN cd /app \ + && cd vllm \ + && pip install -U -r requirements-rocm.txt \ + && if [ "$BUILD_FA" = "1" ]; then \ + bash patch_xformers.rocm.sh; fi \ + && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \ + && python3 setup.py install \ + && cd .. 
-RUN git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData.git \ - && cd rocmProfileData && make; make install +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --no-cache-dir ray[all] -WORKDIR /workspace/vllm +CMD ["/bin/bash"] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index e940c26f24ed9..0eabd1f66ffc5 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -3,23 +3,17 @@ import time from pathlib import Path from typing import Optional -import pandas as pd + import numpy as np import torch from tqdm import tqdm from vllm import LLM, SamplingParams -from torch.profiler import profile, record_function, ProfilerActivity -def list_of_ints(arg): - return list(map(int, arg.split(','))) def main(args: argparse.Namespace): print(args) - print(f'>>>Loading LLM') - if args.report: - results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. 
llm = LLM( @@ -36,101 +30,60 @@ def main(args: argparse.Namespace): ray_workers_use_nsight=args.ray_workers_use_nsight, ) - for batch_size in args.batch_size: - for output_len in args.output_len: - for input_len in args.input_len: - print(f'>>>RUNNING {args.model} Batch_size:{batch_size} Input_len:{input_len} Output_len:{output_len}') - sampling_params = SamplingParams( - n=args.n, - temperature=0.0 if args.use_beam_search else 1.0, - top_p=1.0, - use_beam_search=args.use_beam_search, - ignore_eos=True, - max_tokens=output_len, - ) - print(sampling_params) - dummy_prompt_token_ids = [[0] * input_len] * batch_size - dummy_prompts = [] - dummy_prompts.append('DeepSpeed is a machine learning library that deep learning practitioners should use for what purpose') - - def run_to_completion(profile_dir: Optional[str] = None): - if profile_dir: - with torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - on_trace_ready=torch.profiler.tensorboard_trace_handler( - str(profile_dir))) as p: - llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - print(p.key_averages()) - elif args.accuracy: - start_time = time.perf_counter() - rsp = llm.generate( - #prompt_token_ids=dummy_prompt_token_ids, - prompts=dummy_prompts, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - print('>>Rsp', rsp[0].outputs) - return latency - else: - start_time = time.perf_counter() - rsp = llm.generate(prompt_token_ids=dummy_prompt_token_ids, - sampling_params=sampling_params, - use_tqdm=False) - end_time = time.perf_counter() - latency = end_time - start_time - print('>>Rsp', rsp[0].outputs) - return latency - - print("Warming up...") - run_to_completion(profile_dir=None) - - if (args.warmup_only): - - print(">>> Warmup only specified, exiting") - continue + sampling_params = SamplingParams( + n=args.n, + 
temperature=0.0 if args.use_beam_search else 1.0, + top_p=1.0, + use_beam_search=args.use_beam_search, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompt_token_ids = dummy_prompt_token_ids.tolist() - if args.profile: - profile_dir = args.profile_result_dir - if not profile_dir: - profile_dir = Path( - "." - ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" - print(f"Profiling (results will be saved to '{profile_dir}')...") - run_to_completion(profile_dir=args.profile_result_dir) - return - if args.rpd: - from rpdTracerControl import rpdTracerControl - rpdTracerControl.setFilename(name = "/workspace/trace.rpd", append=True) - profile_rpd = rpdTracerControl() - profile_rpd.start() - print(f"RPD Profiling'...") - with torch.autograd.profiler.emit_nvtx(): - run_to_completion(profile_dir=None) - profile_rpd.stop() - return + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_dir))) as p: + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + print(p.key_averages()) + else: + start_time = time.perf_counter() + llm.generate(prompt_token_ids=dummy_prompt_token_ids, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency - # Benchmark. 
- latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_dir=None)) + print("Warming up...") + run_to_completion(profile_dir=None) - if torch.distributed.get_rank() == 0: - #results_df = pd.DataFrame(columns=['model', 'batch', 'tp', 'input', 'output', 'latency']) - latency=np.mean(latencies) - print(f'Avg latency: {latency} seconds') - if args.report: - entry = {'model':[args.model], 'tp':[args.tensor_parallel_size],'batch':[batch_size], 'input':[input_len], 'output':[output_len], 'latency':[latency]} - results_df = pd.concat([results_df, pd.DataFrame(entry)], ignore_index=True) - if torch.distributed.get_rank() == 0 and args.report: - print(results_df) - results_df.to_csv(args.report_file, index=False) + if args.profile: + profile_dir = args.profile_result_dir + if not profile_dir: + profile_dir = Path( + "." + ) / "vllm_benchmark_result" / f"latency_result_{time.time()}" + print(f"Profiling (results will be saved to '{profile_dir}')...") + run_to_completion(profile_dir=profile_dir) + return + # Benchmark. 
+ latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + print(f'Avg latency: {np.mean(latencies)} seconds') if __name__ == '__main__': @@ -144,9 +97,9 @@ def run_to_completion(profile_dir: Optional[str] = None): choices=['awq', 'gptq', 'squeezellm', None], default=None) parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) - parser.add_argument('--input-len', type=list_of_ints, default=32) - parser.add_argument('--output-len', type=list_of_ints, default=128) - parser.add_argument('--batch-size', type=list_of_ints, default=8) + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) parser.add_argument('--n', type=int, default=1, @@ -159,7 +112,6 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') - parser.add_argument( '--dtype', type=str, @@ -172,9 +124,6 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--enforce-eager', action='store_true', help='enforce eager mode and disable CUDA graph') - parser.add_argument('--accuracy', - action='store_true', - help='Run an Actual query through vllm') parser.add_argument( "--kv-cache-dtype", type=str, @@ -216,14 +165,5 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', help="If specified, use nsight to profile ray workers", ) - parser.add_argument( - '--rpd', - action='store_true', - help='profile the generation process of a single batch using the rpd tracer') - parser.add_argument('--warmup-only', action='store_true', - help='only run warmup, useful for tuning') - parser.add_argument('--report', action='store_true', - help='turn on dataframe reporting') - parser.add_argument('--report-file', type=str, default=None) args = 
parser.parse_args() main(args) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 11524cda2041a..7005509094fec 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -629,7 +629,11 @@ template< typename CACHE_T, int BLOCK_SIZE, bool IS_FP8_KV_CACHE, +#ifdef USE_ROCM int NUM_THREADS = 1024> +#else + int NUM_THREADS = 128> +#endif void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, @@ -810,8 +814,13 @@ template< typename CACHE_T, int BLOCK_SIZE, bool IS_FP8_KV_CACHE, +#ifdef USE_ROCM + int NUM_THREADS = 128, + int PARTITION_SIZE = 512> +#else int NUM_THREADS = 1024, int PARTITION_SIZE = 1024> +#endif void paged_attention_v2_launcher( torch::Tensor& out, torch::Tensor& exp_sums, diff --git a/csrc/cache.h b/csrc/cache.h index 82b90eb4ab631..718a5f6cfd7f7 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -24,13 +24,6 @@ void reshape_and_cache( const std::string& kv_cache_dtype, const float kv_scale); -void gather_cached_kv( - torch::Tensor& key, - torch::Tensor& value, - torch::Tensor& key_cache, - torch::Tensor& value_cache, - torch::Tensor& slot_mapping); - // Just for unittest void convert_fp8( torch::Tensor& src_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 73f61e92b1a51..24aaa2ff3e263 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -277,167 +277,6 @@ void reshape_and_cache( namespace vllm { -// Grid: (num_blocks, block_size). 
-template -__global__ void gather_cached_kv_kernel( - scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int* __restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) { - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int num_tokens = num_heads * head_size; - for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) { - const int tgt_key_idx = token_idx * key_stride + i; - const int tgt_value_idx = token_idx * value_stride + i; - - const int head_idx = i / head_size; - const int head_offset = i % head_size; - const int x_idx = head_offset / x; // the offset of the [head_size/x] dimension - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - key[tgt_key_idx] = VLLM_LDG(&key_cache[src_key_idx]); - value[tgt_value_idx] = VLLM_LDG(&value_cache[src_value_idx]); - } -} - -template -__global__ void gather_cached_kv_kernel_optimized( - scalar_t *__restrict__ key, // [num_tokens, [stride], num_heads, head_size] - scalar_t *__restrict__ value, // [num_tokens, [stride], num_heads, head_size] - const scalar_t *__restrict__ key_cache, // [num_blocks, num_heads, head_size/x, 
block_size, x] - const scalar_t *__restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size] - const int *__restrict__ slot_mapping, // [num_tokens] - const int key_stride, - const int value_stride, - const int num_heads, - const int head_size, - const int block_size, - const int x) -{ - const int token_idx = blockIdx.x; - const int slot_idx = slot_mapping[token_idx]; - const int block_idx = slot_idx / block_size; - const int block_offset = slot_idx % block_size; - - const int dim = num_heads * head_size; - assert(dim % 4 == 0); // this is true for known use cases - const int unroll_factor = 4; - const int unrolled_dim = dim / unroll_factor; - - for (int i = threadIdx.x; i < unrolled_dim; i += blockDim.x) - { - int tgt_key_indices[unroll_factor]; - int tgt_value_indices[unroll_factor]; - int src_key_indices[unroll_factor]; - int src_value_indices[unroll_factor]; - scalar_t keys_to_store[unroll_factor]; - scalar_t values_to_store[unroll_factor]; - - #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - int index = i + j * unrolled_dim; - - const int tgt_key_idx = token_idx * key_stride + index; - const int tgt_value_idx = token_idx * value_stride + index; - - const int head_idx = index / head_size; - const int head_offset = index % head_size; - const int x_idx = head_offset / x; - const int x_offset = head_offset % x; - - const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x - + head_idx * (head_size / x) * block_size * x - + x_idx * block_size * x - + block_offset * x - + x_offset; - const int src_value_idx = block_idx * num_heads * head_size * block_size - + head_idx * head_size * block_size - + head_offset * block_size - + block_offset; - - tgt_key_indices[j] = tgt_key_idx; - tgt_value_indices[j] = tgt_value_idx; - src_key_indices[j] = src_key_idx; - src_value_indices[j] = src_value_idx; - - keys_to_store[j] = VLLM_LDG(&key_cache[src_key_idx]); - values_to_store[j] = VLLM_LDG(&value_cache[src_value_idx]); - } - 
- #pragma unroll - for (int j = 0; j < unroll_factor; ++j) - { - key[tgt_key_indices[j]] = keys_to_store[j]; - value[tgt_value_indices[j]] = values_to_store[j]; - } - } -} - -} // namespace vllm - -void gather_cached_kv( - torch::Tensor& key, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& value, // [out] [num_tokens, num_heads, head_size] - torch::Tensor& key_cache, // [in] [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& value_cache, // [in] [num_blocks, num_heads, head_size, block_size] - torch::Tensor& slot_mapping) // [in] [num_tokens] -{ - int num_tokens = key.size(0); - int num_heads = key.size(1); - int head_size = key.size(2); - int block_size = key_cache.size(3); - int x = key_cache.size(4); - - int key_stride = key.stride(0); - int value_stride = value.stride(0); - - dim3 grid(num_tokens); - dim3 block(std::min(num_heads * head_size, 512)); - const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES( - key.scalar_type(), - "gather_cached_kv_kernel_optimized", - [&] { - vllm::gather_cached_kv_kernel_optimized<<>>( - key.data_ptr(), - value.data_ptr(), - key_cache.data_ptr(), - value_cache.data_ptr(), - slot_mapping.data_ptr(), - key_stride, - value_stride, - num_heads, - head_size, - block_size, - x); - }); -} - -namespace vllm { - template __global__ void convert_fp8_kernel( const Tin* __restrict__ src_cache, diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index d69fc63c62716..de02afc162113 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -90,10 +90,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { "reshape_and_cache", &reshape_and_cache, "Reshape the key and value tensors and cache them"); - cache_ops.def( - "gather_cached_kv", - &gather_cached_kv, - "Gather key and value from the cache into contiguous QKV tensors"); cache_ops.def( "convert_fp8", &convert_fp8, diff --git a/vllm/model_executor/models/internlm.py 
b/vllm/model_executor/models/internlm.py deleted file mode 100644 index 5d0b93793c89d..0000000000000 --- a/vllm/model_executor/models/internlm.py +++ /dev/null @@ -1,299 +0,0 @@ -# -*- coding: utf-8 -*- -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch import nn -from transformers import LlamaConfig - -from vllm.model_executor.input_metadata import InputMetadata -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.attention import PagedAttention -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (LinearMethodBase, - MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding, ParallelLMHead) -from vllm.model_executor.parallel_utils.parallel_state import ( - get_tensor_model_parallel_world_size) -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.weight_utils import (default_weight_loader, - hf_model_weights_iterator) -from vllm.sequence import SamplerOutput - -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -class InternLMMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - linear_method=linear_method) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - linear_method=linear_method) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. 
" - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) - x, _ = self.down_proj(x) - return x - - -class InternLMAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - bias: bool, - rope_theta: float = 10000, - max_position_embeddings: int = 8192, - linear_method: Optional[LinearMethodBase] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - ): - super().__init__() - self.hidden_size = hidden_size - tensor_model_parallel_world_size = ( - get_tensor_model_parallel_world_size()) - self.total_num_heads = num_heads - assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = (self.total_num_heads // - tensor_model_parallel_world_size) - self.head_dim = hidden_size // self.total_num_heads - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - bias=bias, - linear_method=linear_method, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=bias, - linear_method=linear_method, - ) - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.chunk(chunks=3, dim=-1) - q, k = self.rotary_emb(positions, q, k) - k_cache, v_cache = kv_cache - attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class InternLMDecoderLayer(nn.Module): - 
- def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = InternLMAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - bias=config.bias, - rope_theta=rope_theta, - max_position_embeddings=max_position_embeddings, - linear_method=linear_method, - rope_scaling=getattr(config, "rope_scaling", None), - ) - self.mlp = InternLMMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - linear_method=linear_method, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: KVCache, - input_metadata: InputMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - input_metadata=input_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -class InternLMModel(nn.Module): - - def __init__( - self, - config: LlamaConfig, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.padding_idx = config.pad_token_id - self.vocab_size = config.vocab_size - - vocab_size = ((config.vocab_size + 63) // 64) 
* 64 - self.embed_tokens = VocabParallelEmbedding( - vocab_size, - config.hidden_size, - ) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config, linear_method) - for _ in range(config.num_hidden_layers) - ]) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) - residual = None - for i in range(len(self.layers)): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i], - input_metadata, - residual, - ) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class InternLMForCausalLM(nn.Module): - - def __init__( - self, - config, - linear_method: Optional[LinearMethodBase] = None, - ): - super().__init__() - self.config = config - self.linear_method = linear_method - self.model = InternLMModel(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) - self.sampler = Sampler(config.vocab_size) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[KVCache], - input_metadata: InputMetadata, - ) -> torch.Tensor: - hidden_states = self.model(input_ids, positions, kv_caches, - input_metadata) - return hidden_states - - def sample( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, - sampling_metadata) - return next_tokens - - def load_weights(self, - model_name_or_path: str, - cache_dir: Optional[str] = None, - load_format: str = "auto", - revision: Optional[str] = None): - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - 
("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - for name, loaded_weight in hf_model_weights_iterator( - model_name_or_path, cache_dir, load_format, revision): - if "rotary_emb.inv_freq" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) From 86062a5b70b760e0a736cb0b036fa08d450d0eb8 Mon Sep 17 00:00:00 2001 From: "Alexei V. Ivanov" Date: Tue, 19 Mar 2024 15:50:19 +0000 Subject: [PATCH 146/159] Adding functionality with benchmarks/measure_ppl_MC_small.py. The script is supposed to be a quick correctness benchmark. --- benchmarks/measure_ppl_MC_small.py | 171 ++ tests/prompts/wiki.test.raw | 4358 ++++++++++++++++++++++++++++ 2 files changed, 4529 insertions(+) create mode 100755 benchmarks/measure_ppl_MC_small.py create mode 100644 tests/prompts/wiki.test.raw diff --git a/benchmarks/measure_ppl_MC_small.py b/benchmarks/measure_ppl_MC_small.py new file mode 100755 index 0000000000000..2b344058d056b --- /dev/null +++ b/benchmarks/measure_ppl_MC_small.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# +# This is a quick hack that produces PPL measurement by +# iteratively dumping the logprob vector for the single next symbol +# that is to be generated over the preloaded context. 
+# It is actually an *inefficient* procedure because for the +# N-token string it takes N*(preload + generation) time instead of +# preload + N*generation +# + +import numpy as np +from transformers import LlamaForCausalLM, LlamaTokenizer +import pandas as pd +import torch +import sys +import datetime +import json +import argparse +from vllm import LLM, SamplingParams +import math +import operator + +def get_wikitext2_text(tokenizer): + with open(args.data) as f: + test_text = "\n".join(line.strip() for line in f) + test_enc = tokenizer(test_text) + + return test_enc, test_text + +def vllm_init(args): + llm = LLM( + model=args.model, + tokenizer=None, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + kv_cache_dtype=args.kv_cache_dtype, + kv_cache_scales_path=args.kv_cache_scales_path if args.kv_cache_scales_path!='' else None, + max_context_len_to_capture=args.context_size, + ) + + sampling_params = SamplingParams( + n=1, + temperature=0.0, + top_p=1, + use_beam_search=False, + ignore_eos=True, + max_tokens=1, + logprobs=32000, + prompt_logprobs=32000, + presence_penalty=0.0 + ) + + return llm, sampling_params + +def vllm_predict(CONT, llm, sampl_par): + result=llm.generate( prompt_token_ids=CONT,sampling_params=sampl_par) + return result + +def main(args: argparse.Namespace): + + print (f"### Initialising @ {datetime.datetime.now()}") + my_ppl=0.0 + + my_tokenizer = LlamaTokenizer.from_pretrained(args.model) + print("Loaded the tokenizer.") + + print("*** Initializing the engine.") + my_llm, my_sampl_par = vllm_init(args) + print(my_sampl_par) + print("*** Initialized the engine.") + + my_test_enc, my_test_text = get_wikitext2_text(my_tokenizer) + print("Loaded the test data.") + + my_n_samples = (len(my_test_enc['input_ids'])-1)//(args.batch_size*args.context_size) + + print (f"### Starting generation @ {datetime.datetime.now()}") + for c in range(my_n_samples): + CONTEXT = [] + for r in 
range(args.batch_size): + CONTEXT.append(my_test_enc['input_ids'][(c*args.batch_size+r)*args.context_size:(c*args.batch_size+r+1)*args.context_size]) + + LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par) + + for r in range(args.batch_size): + start=args.context_size//2 + my_stat_size=args.context_size-start+1 + for pc in range(start,args.context_size): + my_ppl -= LOGPROBS[r].prompt_logprobs[pc][my_test_enc['input_ids'][(c*args.batch_size+r)*args.context_size+pc]] + my_ppl -= LOGPROBS[r].outputs[0].logprobs[0][my_test_enc['input_ids'][(c*args.batch_size+r+1)*args.context_size]] + print(f"Intermediate estimates:\n\tCross-entropy_intermediate={my_ppl/((c+1)*args.batch_size*my_stat_size)}") + + my_ppl/=(my_n_samples*args.batch_size*my_stat_size) + + print (f"### Done @ {datetime.datetime.now()}") + + print(f"PPL={math.exp(my_ppl)}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') + parser.add_argument('--model', type=str, default='facebook/opt-125m') + parser.add_argument('--data', type=str, default='./wikitext/wikitext-2-v1/test-00000-of-00001.parquet') + parser.add_argument('--context-size', type=int, default=4096) + parser.add_argument('--kv-cache-scales-path', type=str, default='') + parser.add_argument('--num-samples-per-task', type=int, default=1) + parser.add_argument('--temperature', type=float, default=1.0) + parser.add_argument('--experiment-prefix',type=str, default='solution_samples') + parser.add_argument('--tokenizer', type=str, default=None) + parser.add_argument('--quantization', + '-q', + choices=['awq', 'gptq', 'squeezellm', None], + default=None) + parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--n', + 
type=int, + default=1, + help='Number of generated sequences per prompt.') + parser.add_argument('--ppl-measurement', action='store_false') + parser.add_argument('--use-beam-search', action='store_true') + parser.add_argument('--num-iters', + type=int, + default=3, + help='Number of iterations to run.') + parser.add_argument('--trust-remote-code', + action='store_true', + help='trust remote code from huggingface') + parser.add_argument( + '--dtype', + type=str, + default='auto', + choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], + help='data type for model weights and activations. ' + 'The "auto" option will use FP16 precision ' + 'for FP32 and FP16 models, and BF16 precision ' + 'for BF16 models.') + parser.add_argument('--enforce-eager', + action='store_true', + help='enforce eager mode and disable CUDA graph') + parser.add_argument( + "--kv-cache-dtype", + type=str, + choices=['auto', 'fp8_e5m2','fp8'], + default='auto', + help= + 'Data type for kv cache storage. If "auto", will use model data type.') + parser.add_argument( + '--profile', + action='store_true', + help='profile the generation process of a single batch') + parser.add_argument( + '--profile-result-dir', + type=str, + default=None, + help=('path to save the pytorch profiler output. Can be visualized ' + 'with ui.perfetto.dev or Tensorboard.')) + parser.add_argument( + "--device", + type=str, + default="cuda", + choices=["cuda"], + help='device type for vLLM execution, supporting CUDA only currently.') + args = parser.parse_args() + + main(args) diff --git a/tests/prompts/wiki.test.raw b/tests/prompts/wiki.test.raw new file mode 100644 index 0000000000000..d9d7915819350 --- /dev/null +++ b/tests/prompts/wiki.test.raw @@ -0,0 +1,4358 @@ + + = Robert Boulter = + + Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . 
This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . + In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Mark Ravenhill . He appeared on a 2006 episode of the television series , Doctors , followed by a role in the 2007 theatre production of How to Curse directed by Josie Rourke . How to Curse was performed at Bush Theatre in the London Borough of Hammersmith and Fulham . Boulter starred in two films in 2008 , Daylight Robbery by filmmaker Paris Leonti , and Donkey Punch directed by Olly Blackburn . In May 2008 , Boulter made a guest appearance on a two @-@ part episode arc of the television series Waking the Dead , followed by an appearance on the television series Survivors in November 2008 . He had a recurring role in ten episodes of the television series Casualty in 2010 , as " Kieron Fletcher " . Boulter starred in the 2011 film Mercenaries directed by Paris Leonti . + + = = Career = = + + + = = = 2000 – 2005 = = = + + In 2000 Boulter had a guest @-@ starring role on the television series The Bill ; he portrayed " Scott Parry " in the episode , " In Safe Hands " . Boulter starred as " Scott " in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . 
A review of Boulter 's performance in The Independent on Sunday described him as " horribly menacing " in the role , and he received critical reviews in The Herald , and Evening Standard . He appeared in the television series Judge John Deed in 2002 as " Addem Armitage " in the episode " Political Expediency " , and had a role as a different character " Toby Steele " on The Bill . + He had a recurring role in 2003 on two episodes of The Bill , as character " Connor Price " . In 2004 Boulter landed a role as " Craig " in the episode " Teddy 's Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . Boulter starred as " Darren " , in the 2005 theatre productions of the Philip Ridley play Mercury Fur . It was performed at the Drum Theatre in Plymouth , and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . Boulter received a favorable review in The Daily Telegraph : " The acting is shatteringly intense , with wired performances from Ben Whishaw ( now unrecognisable from his performance as Trevor Nunn 's Hamlet ) , Robert Boulter , Shane Zaza and Fraser Ayres . " The Guardian noted , " Ben Whishaw and Robert Boulter offer tenderness amid the savagery . " + + = = = 2006 – present = = = + + In 2006 Boulter starred in the play Citizenship written by Mark Ravenhill . The play was part of a series which featured different playwrights , titled Burn / Chatroom / Citizenship . In a 2006 interview , fellow actor Ben Whishaw identified Boulter as one of his favorite co @-@ stars : " I loved working with a guy called Robert Boulter , who was in the triple bill of Burn , Chatroom and Citizenship at the National . He played my brother in Mercury Fur . " He portrayed " Jason Tyler " on the 2006 episode of the television series , Doctors , titled " Something I Ate " . 
Boulter starred as " William " in the 2007 production of How to Curse directed by Josie Rourke . How to Curse was performed at Bush Theatre in the London Borough of Hammersmith and Fulham . In a review of the production for The Daily Telegraph , theatre critic Charles Spencer noted , " Robert Boulter brings a touching vulnerability to the stage as William . " + Boulter starred in two films in 2008 , Daylight Robbery by filmmaker Paris Leonti , and Donkey Punch directed by Olly Blackburn . Boulter portrayed a character named " Sean " in Donkey Punch , who tags along with character " Josh " as the " quiet brother ... who hits it off with Tammi " . Boulter guest starred on a two @-@ part episode arc " Wounds " in May 2008 of the television series Waking the Dead as character " Jimmy Dearden " . He appeared on the television series Survivors as " Neil " in November 2008 . He had a recurring role in ten episodes of the television series Casualty in 2010 , as " Kieron Fletcher " . He portrayed an emergency physician applying for a medical fellowship . He commented on the inherent difficulties in portraying a physician on television : " Playing a doctor is a strange experience . Pretending you know what you 're talking about when you don 't is very bizarre but there are advisers on set who are fantastic at taking you through procedures and giving you the confidence to stand there and look like you know what you 're doing . " Boulter starred in the 2011 film Mercenaries directed by Paris Leonti . + + = = Filmography = = + + + = = = Film = = = + + + = = = Television = = = + + + = = = Theatre = = = + + + + = Du Fu = + + Du Fu ( Wade – Giles : Tu Fu ; Chinese : 杜甫 ; 712 – 770 ) was a prominent Chinese poet of the Tang dynasty . Along with Li Bai ( Li Po ) , he is frequently called the greatest of the Chinese poets . His greatest ambition was to serve his country as a successful civil servant , but he proved unable to make the necessary accommodations . 
His life , like the whole country , was devastated by the An Lushan Rebellion of 755 , and his last 15 years were a time of almost constant unrest . + Although initially he was little @-@ known to other writers , his works came to be hugely influential in both Chinese and Japanese literary culture . Of his poetic writing , nearly fifteen hundred poems have been preserved over the ages . He has been called the " Poet @-@ Historian " and the " Poet @-@ Sage " by Chinese critics , while the range of his work has allowed him to be introduced to Western readers as " the Chinese Virgil , Horace , Ovid , Shakespeare , Milton , Burns , Wordsworth , Béranger , Hugo or Baudelaire " . + + = = Life = = + + Traditional Chinese literary criticism emphasized the life of the author when interpreting a work , a practice which Burton Watson attributes to " the close links that traditional Chinese thought posits between art and morality " . Since many of Du Fu 's poems feature morality and history , this practice is particularly important . Another reason , identified by the Chinese historian William Hung , is that Chinese poems are typically concise , omitting context that might be relevant , but which an informed contemporary could be assumed to know . For modern Western readers , " The less accurately we know the time , the place and the circumstances in the background , the more liable we are to imagine it incorrectly , and the result will be that we either misunderstand the poem or fail to understand it altogether " . Stephen Owen suggests a third factor particular to Du Fu , arguing that the variety of the poet 's work required consideration of his whole life , rather than the " reductive " categorizations used for more limited poets . + + = = = Early years = = = + + Most of what is known of Du Fu 's life comes from his poems . His paternal grandfather was Du Shenyan , a noted politician and poet during the reign of Empress Wu . 
Du Fu was born in 712 ; the exact birthplace is unknown , except that it was near Luoyang , Henan province ( Gong county is a favourite candidate ) . In later life , he considered himself to belong to the capital city of Chang 'an , ancestral hometown of the Du family . + Du Fu 's mother died shortly after he was born , and he was partially raised by his aunt . He had an elder brother , who died young . He also had three half brothers and one half sister , to whom he frequently refers in his poems , although he never mentions his stepmother . + The son of a minor scholar @-@ official , his youth was spent on the standard education of a future civil servant : study and memorisation of the Confucian classics of philosophy , history and poetry . He later claimed to have produced creditable poems by his early teens , but these have been lost . + In the early 730s , he travelled in the Jiangsu / Zhejiang area ; his earliest surviving poem , describing a poetry contest , is thought to date from the end of this period , around 735 . In that year , he took the civil service exam , likely in Chang 'an . He failed , to his surprise and that of centuries of later critics . Hung concludes that he probably failed because his prose style at the time was too dense and obscure , while Chou suggests his failure to cultivate connections in the capital may have been to blame . After this failure , he went back to traveling , this time around Shandong and Hebei . + His father died around 740 . Du Fu would have been allowed to enter the civil service because of his father 's rank , but he is thought to have given up the privilege in favour of one of his half brothers . He spent the next four years living in the Luoyang area , fulfilling his duties in domestic affairs . + In the autumn of 744 , he met Li Bai ( Li Po ) for the first time , and the two poets formed a friendship . 
David Young describes this as " the most significant formative element in Du Fu 's artistic development " because it gave him a living example of the reclusive poet @-@ scholar life to which he was attracted after his failure in the civil service exam . The relationship was somewhat one @-@ sided , however . Du Fu was by some years the younger , while Li Bai was already a poetic star . We have twelve poems to or about Li Bai from the younger poet , but only one in the other direction . They met again only once , in 745 . + In 746 , he moved to the capital in an attempt to resurrect his official career . He took the civil service exam a second time during the following year , but all the candidates were failed by the prime minister ( apparently in order to prevent the emergence of possible rivals ) . He never again attempted the examinations , instead petitioning the emperor directly in 751 , 754 and probably again in 755 . He married around 752 , and by 757 the couple had had five children — three sons and two daughters — but one of the sons died in infancy in 755 . From 754 he began to have lung problems ( probably asthma ) , the first of a series of ailments which dogged him for the rest of his life . It was in that year that Du Fu was forced to move his family due to the turmoil of a famine brought about by massive floods in the region . + In 755 , he received an appointment as Registrar of the Right Commandant 's office of the Crown Prince 's Palace . Although this was a minor post , in normal times it would have been at least the start of an official career . Even before he had begun work , however , the position was swept away by events . + + = = = War = = = + + The An Lushan Rebellion began in December 755 , and was not completely suppressed for almost eight years . 
It caused enormous disruption to Chinese society : the census of 754 recorded 52 @.@ 9 million people , but ten years later , the census counted just 16 @.@ 9 million , the remainder having been displaced or killed . During this time , Du Fu led a largely itinerant life unsettled by wars , associated famines and imperial displeasure . This period of unhappiness was the making of Du Fu as a poet : Even Shan Chou has written that , " What he saw around him — the lives of his family , neighbors , and strangers – what he heard , and what he hoped for or feared from the progress of various campaigns — these became the enduring themes of his poetry " . Even when he learned of the death of his youngest child , he turned to the suffering of others in his poetry instead of dwelling upon his own misfortunes . Du Fu wrote : + Brooding on what I have lived through , if even I know such suffering , the common man must surely be rattled by the winds . + In 756 , Emperor Xuanzong was forced to flee the capital and abdicate . Du Fu , who had been away from the city , took his family to a place of safety and attempted to join the court of the new emperor ( Suzong ) , but he was captured by the rebels and taken to Chang 'an . In the autumn , his youngest son , Du Zongwu ( Baby Bear ) , was born . Around this time Du Fu is thought to have contracted malaria . + He escaped from Chang 'an the following year , and was appointed Reminder when he rejoined the court in May 757 . This post gave access to the emperor but was largely ceremonial . Du Fu 's conscientiousness compelled him to try to make use of it : he caused trouble for himself by protesting the removal of his friend and patron Fang Guan on a petty charge . He was arrested but was pardoned in June . He was granted leave to visit his family in September , but he soon rejoined the court and on December 8 , 757 , he returned to Chang 'an with the emperor following its recapture by government forces . 
However , his advice continued to be unappreciated , and in the summer of 758 he was demoted to a post as Commissioner of Education in Huazhou . The position was not to his taste : in one poem , he wrote : + I am about to scream madly in the office / Especially when they bring more papers to pile higher on my desk . + He moved on in the summer of 759 ; this has traditionally been ascribed to famine , but Hung believes that frustration is a more likely reason . He next spent around six weeks in Qinzhou ( now Tianshui , Gansu province ) , where he wrote more than sixty poems . + + = = = Chengdu = = = + + In December 759 , he briefly stayed in Tonggu ( modern Gansu ) . He departed on December 24 for Chengdu ( Sichuan province ) , where he was hosted by local Prefect and fellow poet Pei Di . Du subsequently based himself in Sichuan for most of the next five years . By the autumn of that year he was in financial trouble , and sent poems begging help to various acquaintances . He was relieved by Yan Wu , a friend and former colleague who was appointed governor general at Chengdu . Despite his financial problems , this was one of the happiest and most peaceful periods of his life . Many of Du 's poems from this period are peaceful depictions of his life at " thatched hut " . In 762 , he left the city to escape a rebellion , but he returned in summer 764 when he was appointed an advisor to Yan , who was involved in campaigns against the Tibetan Empire . + + = = = Last years = = = + + Luoyang , the region of his birthplace , was recovered by government forces in the winter of 762 , and in the spring of 765 Du Fu and his family sailed down the Yangtze , apparently with the intention of making their way there . They traveled slowly , held up by his ill @-@ health ( by this time he was suffering from poor eyesight , deafness and general old age in addition to his previous ailments ) . 
They stayed in Kuizhou ( in what is now Baidicheng , Chongqing ) at the entrance to the Three Gorges for almost two years from late spring 766 . This period was Du Fu 's last great poetic flowering , and here he wrote 400 poems in his dense , late style . In autumn 766 , Bo Maolin became governor of the region : he supported Du Fu financially and employed him as his unofficial secretary . + In March 768 , he began his journey again and got as far as Hunan province , where he died in Tanzhou ( now Changsha ) in November or December 770 , in his 58th year . He was survived by his wife and two sons , who remained in the area for some years at least . His last known descendant is a grandson who requested a grave inscription for the poet from Yuan Zhen in 813 . + Hung summarises his life by concluding that , " He appeared to be a filial son , an affectionate father , a generous brother , a faithful husband , a loyal friend , a dutiful official , and a patriotic subject . " + Below is an example of one of Du Fu 's later works , To My Retired Friend Wei ( Chinese : 贈衛八處士 ) . Like many other poems in the Tang it featured the theme of a long parting between friends , which was often due to officials being frequently transferred to the provinces : + + = = Works = = + + Criticism of Du Fu 's works has focused on his strong sense of history , his moral engagement , and his technical excellence . + + = = = History = = = + + Since the Song dynasty , critics have called Du Fu the " poet historian " ( 詩史 shī shǐ ) . The most directly historical of his poems are those commenting on military tactics or the successes and failures of the government , or the poems of advice which he wrote to the emperor . Indirectly , he wrote about the effect of the times in which he lived on himself , and on the ordinary people of China . As Watson notes , this is information " of a kind seldom found in the officially compiled histories of the era " . 
+ Du Fu 's political comments are based on emotion rather than calculation : his prescriptions have been paraphrased as , " Let us all be less selfish , let us all do what we are supposed to do " . Since his views were impossible to disagree with , his forcefully expressed truisms enabled his installation as the central figure of Chinese poetic history . + + = = = Moral engagement = = = + + A second favourite epithet of Chinese critics is that of " poet sage " ( 詩聖 shī shèng ) , a counterpart to the philosophical sage , Confucius . One of the earliest surviving works , The Song of the Wagons ( from around 750 ) , gives voice to the sufferings of a conscript soldier in the imperial army and a clear @-@ sighted consciousness of suffering . These concerns are continuously articulated in poems on the lives of both soldiers and civilians produced by Du Fu throughout his life . + Although Du Fu 's frequent references to his own difficulties can give the impression of an all @-@ consuming solipsism , Hawkes argues that his " famous compassion in fact includes himself , viewed quite objectively and almost as an afterthought " . He therefore " lends grandeur " to the wider picture by comparing it to " his own slightly comical triviality " . + Du Fu 's compassion , for himself and for others , was part of his general broadening of the scope of poetry : he devoted many works to topics which had previously been considered unsuitable for poetic treatment . Zhang Jie wrote that for Du Fu , " everything in this world is poetry " , Du wrote extensively on subjects such as domestic life , calligraphy , paintings , animals , and other poems . + + = = = Technical excellence = = = + + Du Fu 's work is notable above all for its range . Chinese critics traditionally used the term 集大成 ( jídàchéng- " complete symphony " ) , a reference to Mencius ' description of Confucius . 
Yuan Zhen was the first to note the breadth of Du Fu 's achievement , writing in 813 that his predecessor , " united in his work traits which previous men had displayed only singly " . He mastered all the forms of Chinese poetry : Chou says that in every form he " either made outstanding advances or contributed outstanding examples " . Furthermore , his poems use a wide range of registers , from the direct and colloquial to the allusive and self @-@ consciously literary . This variety is manifested even within individual works : Owen identifies the , " rapid stylistic and thematic shifts " in poems which enable the poet to represent different facets of a situation , while Chou uses the term " juxtaposition " as the major analytical tool in her work . Du Fu is noted for having written more on poetics and painting than any other writer of his time . He wrote eighteen poems on painting alone , more than any other Tang poet . Du Fu 's seemingly negative commentary on the prized horse paintings of Han Gan ignited a controversy that has persisted to the present day . + The tenor of his work changed as he developed his style and adapted to his surroundings ( " chameleon @-@ like " according to Watson ) : his earliest works are in a relatively derivative , courtly style , but he came into his own in the years of the rebellion . Owen comments on the " grim simplicity " of the Qinzhou poems , which mirrors the desert landscape ; the works from his Chengdu period are " light , often finely observed " ; while the poems from the late Kuizhou period have a " density and power of vision " . + Although he wrote in all poetic forms , Du Fu is best known for his lǜshi , a type of poem with strict constraints on form and content , for example : + About two thirds of Du Fu 's 1500 extant works are in this form , and he is generally considered to be its leading exponent . 
His best lǜshi use the parallelisms required by the form to add expressive content rather than as mere technical restrictions . Hawkes comments that , " it is amazing that Tu Fu is able to use so immensely stylized a form in so natural a manner " . + + = = Influence = = + + According to the Encyclopædia Britannica , Du Fu 's writings are considered by many literary critics to be among the greatest of all time , and it states " his dense , compressed language makes use of all the connotative overtones of a phrase and of all the intonational potentials of the individual word , qualities that no translation can ever reveal . " + In his lifetime and immediately following his death , Du Fu was not greatly appreciated . In part this can be attributed to his stylistic and formal innovations , some of which are still " considered extremely daring and bizarre by Chinese critics . " There are few contemporary references to him — only eleven poems from six writers — and these describe him in terms of affection , but not as a paragon of poetic or moral ideals . Du Fu is also poorly represented in contemporary anthologies of poetry . + However , as Hung notes , he " is the only Chinese poet whose influence grew with time " , and his works began to increase in popularity in the ninth century . Early positive comments came from Bai Juyi , who praised the moral sentiments of some of Du Fu 's works ( although he found these in only a small fraction of the poems ) , and from Han Yu , who wrote a piece defending Du Fu and Li Bai on aesthetic grounds from attacks made against them . Both these writers showed the influence of Du Fu in their own poetic work . By the beginning of the 10th century , Wei Zhuang constructed the first replica of his thatched cottage in Sichuan . + It was in the 11th century , during the Northern Song era that Du Fu 's reputation reached its peak . 
In this period a comprehensive re @-@ evaluation of earlier poets took place , in which Wang Wei , Li Bai and Du Fu came to be regarded as representing respectively the Buddhist , Daoist and Confucian strands of Chinese culture . At the same time , the development of Neo @-@ Confucianism ensured that Du Fu , as its poetic exemplar , occupied the paramount position . Su Shi famously expressed this reasoning when he wrote that Du Fu was " preeminent ... because ... through all his vicissitudes , he never for the space of a meal forgot his sovereign " . His influence was helped by his ability to reconcile apparent opposites : political conservatives were attracted by his loyalty to the established order , while political radicals embraced his concern for the poor . Literary conservatives could look to his technical mastery , while literary radicals were inspired by his innovations . Since the establishment of the People 's Republic of China , Du Fu 's loyalty to the state and concern for the poor have been interpreted as embryonic nationalism and socialism , and he has been praised for his use of simple , " people 's language " . + Du Fu 's popularity grew to such an extent that it is as hard to measure his influence as that of Shakespeare in England : it was hard for any Chinese poet not to be influenced by him . While there was never another Du Fu , individual poets followed in the traditions of specific aspects of his work : Bai Juyi 's concern for the poor , Lu You 's patriotism , and Mei Yaochen 's reflections on the quotidian are a few examples . More broadly , Du Fu 's work in transforming the lǜshi from mere word play into " a vehicle for serious poetic utterance " set the stage for every subsequent writer in the genre . 
+ In the 20th century , he was the favourite poet of Kenneth Rexroth , who has described him as " the greatest non @-@ epic , non @-@ dramatic poet who has survived in any language " , and commented that , " he has made me a better man , as a moral agent and as a perceiving organism " . + + = = = Influence on Japanese literature = = = + + Du Fu 's poetry has made a profound impact on Japanese literature , especially on the literature from the Muromachi period and on scholars and poets in the Edo period , including Matsuo Bashō , the very greatest of all haiku poets . Even in modern Japanese , the term Saint of Poetry ( 詩聖 , shisei ) is mostly synonymous with Du Fu . + Until the 13th century , the Japanese preferred Bai Juyi above all poets and there were few references to Du Fu , although his influence can be seen in some kanshi ( " Chinese poetry made by Japanese poets " ) anthologies such as Bunka Shūreishū in the 9th century . The first notable Japanese appreciator of Du Fu 's poetry was Kokan Shiren ( 1278 – 1346 ) , a Rinzai Zen patriarch and one of the most prominent authors of the literature of the Five Mountains ; he highly praised Du Fu and made a commentary on some poems of Du Fu from the perspective of a Zen priest in Vol . 11 of Saihokushū . His student Chūgan Engetsu composed many kanshi which were clearly stated " influenced by Du Fu " in their prefaces . Chūgan 's student Gidō Shūshin had close connection with the Court and Ashikaga Shogunate and propagated Du Fu 's poetry in the mundane world ; one day Nijō Yoshimoto , the Kampaku regent of the Court and the highest authority of renga poetry , asked Gidō , " Should I learn the poetry of Du Fu and Li Bai ? " Gidō dared to reply , " Yes if you do have enough capability . No if do not . " Since then , there had been many seminars on Du Fu 's poetry both in Zen temples and in the aristocratic society , and as a result his poetry was often cited in Japanese literature in the Muromachi period , e.g. 
, Taiheiki , a historical epic in the late 14th century , and some noh plays such as Hyakuman , Bashō , and Shunkan . + During the Kan 'ei era of the Edo period ( 1624 – 1643 ) , Shào Chuán ( 邵傳 ) of the Ming Dynasty 's Collective Commentary on Du Fu 's Lǜshi ( 杜律集解 , Toritsu Shikkai ) was imported into Japan , and it gained explosive popularity among Confucian scholars and the chōnin ( townspeople ) class . The commentary established Du Fu 's fame as the highest of all poets ; for instance , Hayashi Shunsai , a notable Confucian scholar , commented in Vol . 37 of Gahō Bunshū that Zǐměi [ Du Fu ] was the very best poet in history and praised Shào Chuán 's commentary for its simplicity and readability , while he criticized old commentaries from the Yuan Dynasty as too unfathomable . Matsuo Bashō , the greatest haiku poet , was also strongly influenced by Du Fu ; in Oku no Hosomichi , his masterpiece , he cites the first two lines of A Spring View ( 春望 ) before a haiku as its introduction , and many of his other haiku also have similar wording and themes . It is said that when he died in Osaka during a long journey , a copy of Du Fu 's poetry was found with him as one of a few precious items which he was able to carry around . + + = = Translation = = + + A variety of styles have been used in efforts to translate Du Fu 's work into English . As Burton Watson remarks in The Selected Poems of Du Fu , " There are many different ways to approach the problems involved in translating Du Fu , which is why we need as many different translations as possible " ( p. xxii ) . The translators have had to contend with bringing out the formal constraints of the original without sounding laboured to a Western ear ( particularly when translating regulated verse , or lǜshi ) , and accommodating the complex allusions contained particularly in the later works ( Hawkes writes that " his poems do not as a rule come through very well in translation " — p. ix ) . 
One extreme on each issue is represented by Kenneth Rexroth 's One Hundred Poems From the Chinese . His are free translations , which seek to conceal the parallelisms through enjambement and expansion and contraction of the content ; his responses to the allusions are firstly to omit most of these poems from his selection , and secondly to " translate out " the references in those works which he does select . + Other translators have placed much greater weight on trying to convey a sense of the poetic forms used by Du Fu . Vikram Seth in Three Chinese Poets uses English @-@ style rhyme schemes , whereas Keith Holyoak in Facing the Moon approximates the Chinese rhyme scheme ; both use end @-@ stopped lines and preserve some degree of parallelism . In The Selected Poems of Du Fu , Burton Watson follows the parallelisms quite strictly , persuading the western reader to adapt to the poems rather than vice versa . Similarly , he deals with the allusion of the later works by combining literal translation with extensive annotation . + In 2015 , Stephen Owen published translations , with facing Chinese texts , of the complete poetry of Du Fu in six volumes , with extensive scholarly apparatus , which emphasized literalness . + + + = Kiss You ( One Direction song ) = + + " Kiss You " is a song recorded by English @-@ Irish boy band One Direction for their second studio album , Take Me Home ( 2012 ) . It was released as the record 's second single in Germany and the third overall single on 7 January 2013 . The song was composed by Kristoffer Fogelmark , Kristian Lundin , Albin Nedler , Savan Kotecha , Shellback and its producers , Carl Falk and Rami Yacoub . " Kiss You " is an upbeat power pop song with electronic effects ; the lyrics detail a protagonist 's infatuation with a significant other . Critics praised the song for its production , calling it a stand @-@ out track on Take Me Home . 
+ The track became the group 's sixth top @-@ ten hit in Ireland and the United Kingdom , while attaining top @-@ forty positions in both Belgian territories ( Flanders and Wallonia ) , as well as in Australia , Canada , Denmark , France , New Zealand , and the Netherlands . The single peaked at number 46 on the US Billboard Hot 100 and has been certified gold by the Recording Industry Association of America ( RIAA ) for shipments of 500 @,@ 000 copies . One Direction performed " Kiss You " on both the UK and US versions of The X Factor and 3 major concert tours : Take Me Home Tour ( 2013 ) , Where We Are Tour ( 2014 ) and On the Road Again Tour ( 2015 ) . + An accompanying music video , designed to display the group 's comedic timing , was directed by Vaughan Arnell , who had previously worked with the group on two other music videos . The clip depicts the band shooting various scenes via a green screen , which include sequences reminiscent of iconic music videos of songs such as the Beach Boys ' " Surfer Girl " , Elvis Presley 's " Jailhouse Rock " and Rammstein 's " Mein Land " . The music video received 10 @.@ 4 million views in a 24 @-@ hour period and positive commentary from reviewers , who appreciated its carefree , jubilant nature . + The song was included in the dancing game Just Dance 2014 , and is also one of the select songs available on the demo version . Additionally , it is the final main track on the US edition of Now That 's What I Call Music ! 46 . + + = = Background and release = = + + " Kiss You " was written by Kristoffer Fogelmark , Kristian Lundin , Albin Nedler , Savan Kotecha , Shellback , and its producers , Carl Falk and Rami Yacoub . Falk , Kotecha , and Yacoub had collaboratively composed One Direction 's previous hit singles , " What Makes You Beautiful " , " One Thing " , and " Live While We 're Young " . 
In April 2012 , The Independent reported that Simon Cowell , the group 's manager , had challenged prominent songwriters to compete for space on One Direction 's second album . Falk said , " It 's important to get their personalities on the music . " In addition , the article reported that Syco Records was working on candidates that included Max Martin and Lundin . + " Kiss You " was chosen as the second US single and third international single from their second studio album , Take Me Home . Liam Payne , a group member , in a November 2012 interview with MTV News , explained why they chose " Kiss You " as the album 's second single in the US . Payne was quoted as saying : " With the album , that 's the first one that we listened to and we were like , ' Yeah , we love this song ' " . According to an MTV News article , the number was released digitally in the United States on 17 November 2012 . By 18 January 2013 , the song had not been officially promoted to US radio stations . The track , however , was released by Sony Music Entertainment on 8 February 2013 , as the record 's second single in Germany . + + = = Composition and reception = = + + " Kiss You " is an uptempo , upbeat power pop song which runs for a duration of 3 : 04 ( three minutes , four seconds ) . The track features electronic effects , colossal hooks , a " na na na " breakdown , and a Motown @-@ tinged melody . One Direction 's vocal range in the song spans from the note of E4 to C ♯ 6 . Instrumentation includes guitar strings , piano lines and vocals . Written in the key of E major , the beat is set in common time and moves at a quick 90 beats per minute , according to the digital sheet music published at Musicnotes.com by Sony / ATV Music Publishing . Likewise , Matt Collar from Allmusic noted that the track is " frenetically hyper " . 
The lyrical content regards the protagonist 's infatuation with a significant other , and incorporates euphemisms for sexual intercourse in the lines " If you don ’ t wanna take it slow / And you just wanna take me home / Baby say yeah , yeah , yeah , yeah , yeah . " + " Kiss You " was well received by contemporary music critics , who centred on its quality of production . Both Rolling Stone 's Jon Dolan , who praised its effectiveness , and Chris Payne of Billboard , who appreciated the melody , described " Kiss You " as one of the album 's highlights . Alexis Petridis for The Guardian commended the track 's chorus as " hard to dislodge from your brain " . Robert Copsey of Digital Spy noted the song 's possibility to become an international hit , applauding it sonically . A reviewer for MTV News described the track 's lyricism as " butterflies @-@ inducing " , and Sam Lansky of Idolator wrote that " Kiss You " is noticeably a stand @-@ out track on its parent album . Melinda Newman , writing for HitFix , regarded the song as " a bouncy , electronic infectious ditty , " while Chris Younie , a critic from 4Music , deemed it an " amazing pop song " , lauding the group 's falsetto and its " head @-@ banging anthemic " chorus . + + = = Commercial performance = = + + The single made its Irish Singles Chart debut at number 24 on the week ending 13 December 2012 . It peaked at number seven on the week ending 17 January 2013 , marking their sixth top ten appearance in Ireland . " Kiss You " entered at number 152 in the UK Singles Chart on 24 November 2012 . It peaked at number nine on the UK Singles Chart on 26 January 2013 , becoming One Direction 's sixth top ten hit in the United Kingdom . On the week ending 18 November 2012 , " Kiss You " debuted at number 90 on the United States Billboard Hot 100 due to digital download sales from its parent album . 
As a result of an " end @-@ of @-@ year download rush " on the week ending 30 December 2012 , the track re @-@ entered the Hot 100 at number 83 . After the accompanying music video was released , the song re @-@ entered the Hot 100 at number 65 . " Kiss You " had sold 207 @,@ 000 digital downloads in the US by 18 January 2013 . The single ultimately peaked at number 46 on the Hot 100 and was certified gold by the Recording Industry Association of America ( RIAA ) on 25 April 2013 , denoting shipments of 500 @,@ 000 copies . + The song became One Direction 's fourth top @-@ forty hit on the Canadian Hot 100 , peaking at number 30 . The single debuted at number 13 on the Australian Singles Chart on 27 January 2013 , marking its peak position and the group 's fourth top twenty hit in Australia . The song has been certified platinum by the Australian Recording Industry Association ( ARIA ) for shipments of 70 @,@ 000 copies . The track entered the New Zealand Singles Chart at number 17 on 11 January 2013 . It peaked at number 13 in its third and fourth charting weeks , becoming the group 's sixth top @-@ forty appearance in New Zealand . " Kiss You " has received a gold certification from the Recording Industry Association of New Zealand ( RIANZ ) , indicating sales of 7 @,@ 500 copies . The track also reached the top 40 in both Belgian territories ( Flanders and Wallonia ) , as well as in the Czech Republic , Denmark , France , the Netherlands , and South Korea . In addition , " Kiss You " received gold certifications from the IFPI Norway and Denmark associations , signifying collective shipments of 20 @,@ 000 units . + + = = Music video = = + + The accompanying music video , directed by Vaughan Arnell , who had previously directed One Direction 's music videos for " Live While We 're Young " and " Little Things " , was designed to showcase the group 's comedic timing . 
Inspired by the Beach Boys , cult surfing films , old Hollywood , and British cinema , the music video incorporates " a technicolor vibe and a British kind of romp " , as noted by Arnell in an MTV News interview . + Shot by November 2012 , the music video was characterised , in several MTV News interviews , as " bigger than anything we 've done before " by Zayn Malik , as " a lot of hard work " by Payne , as " pure stupidity " by Louis Tomlinson , and as " I wouldn 't say [ it 's ] comedy , it 's all tongue @-@ in @-@ cheek " by Arnell . Premiering worldwide on Vevo on 7 January 2013 , the music video depicts the band shooting different scenes via a green screen , dressed as sailors , surfers , skiers and jailers . The video features scenes reminiscent of the films South Pacific , To Catch a Thief , Jailhouse Rock and Beach Blanket Bingo , as well as the iconic music videos of songs such as The Beach Boys ' " Surfer Girl " , Elvis Presley 's " Blue Hawaii " and Rammstein 's " Mein Land " , among others . + The music video garnered 10 @.@ 4 million views in a 24 @-@ hour period , failing to attain the Vevo record held by Justin Bieber 's " Beauty and a Beat " music video ( 10 @.@ 6 million ) . Despite a 34 % gain in weekly activity to their Vevo channel , with the clip 's success and preceding teaser videos earning 38 million views during the week , One Direction held at number two on Billboard 's Social 50 chart . A 15 % rise in Facebook reaction led to a 154 @,@ 000 increase in Facebook likes during the week . The addition of 191 @,@ 000 Twitter followers contributed to their overall fan base increase as well . + Melinda Newman , a contributor for HitFix , favoured the clip as having " everything a video by a boy band should be " and found the group 's careless tone delightful . Rebecca Macatee of E ! Online praised its " intentionally cheesy and utterly adorable " sequences , and MTV News 's Jocelyn Vena described the clip as " conquering old Hollywood " . 
Molly Chance , writing for Zap2it , was convinced that upon watching the " adorable " music video , the viewer should have a hard time disliking the group . Mikael Wood , the critic for Los Angeles Times , commended the group for " having a genuinely great time " , rather than going through the motions . + + = = Live performances = = + + As part of its promotion , One Direction performed the song on televised programmes and during their worldwide Take Me Home Tour ( 2013 ) . One Direction performed the track on The Today Show at the Rockefeller Center on 13 November 2012 , to a record crowd estimated at 15 @,@ 000 . " Kiss You " was included in the set list of the group 's 3 December 2012 sold @-@ out show at New York City 's Madison Square Garden . One Direction delivered a performance of " Kiss You " , in front of a video game @-@ themed set , on the final of the ninth series of The X Factor UK on 10 December 2012 . According to the Daily Mail , their " energetic rendition " of " Kiss You " proved that the group have an elusive quality . On 12 December 2012 , the group also performed the number on the final of the second season of The X Factor USA . Considering One Direction the " franchise 's biggest success story " , an editor for The Huffington Post opined that the boy band 's prominent presence on both the US and UK versions of The X Factor seemed fitting . 
In addition to the Take Me Home Tour , they also performed the song on the Where We Are Tour ( 2014 ) and the On the Road Again Tour ( 2015 ) . + + = = Track listing = = + + CD single + " Kiss You " – 3 : 04 + " Little Things " – 3 : 42 + + = = Credits and personnel = = + + Carl Falk — writing , production , programming , instruments , guitar , background vocals + Kristoffer Fogelmark — background vocals + Niall Horan — additional guitar + Savan Kotecha — writing , background vocals + Kristian Lundin — writing + Albin Nedler — writing , background vocals + Shellback — writing + Rami Yacoub — writing , production , programming , instruments , bass + Credits adapted from Take Me Home 's liner notes . + + = = Charts = = + + + = = Certifications = = + + + = = Release history = = + + + + = Ise @-@ class battleship = + + The Ise @-@ class battleships ( 伊勢型戦艦 , Ise @-@ gata senkan ) were a pair of dreadnought battleships built for the Imperial Japanese Navy ( IJN ) during World War I . Originally intended to be repeats of the preceding Fusō class , they were redesigned before construction began . Both ships carried supplies for the survivors of the Great Kantō earthquake in 1923 . They were modernized in 1934 – 37 with improvements to their armour and machinery and a rebuilt superstructure in the pagoda mast style . Afterwards they played a minor role in the Second Sino @-@ Japanese War . + Despite the expensive reconstructions , both vessels were considered obsolete by the eve of the Pacific War , and neither saw significant action in the early years of the war . Following the loss of most of the IJN 's large aircraft carriers during the Battle of Midway in mid @-@ 1942 , they were rebuilt with a flight deck replacing the rear pair of gun turrets to give them the ability to operate an air group of floatplanes . A lack of aircraft and qualified pilots , however , meant that they never actually operated their aircraft in combat . 
While awaiting their air group the sister ships were sometimes used to ferry troops and material to Japanese bases . They participated in the Battle of Cape Engaño in late 1944 , where they decoyed the American carrier fleet supporting the invasion of Leyte away from the landing beaches . Afterwards both ships were transferred to Southeast Asia ; in early 1945 they participated in Operation Kita , where they transported petrol and other strategic materials to Japan . The sisters were then reduced to reserve until they were sunk during American airstrikes in July . After the war they were scrapped in 1946 – 47 . + + = = Background = = + + The design of the Fusō @-@ class battleships was shaped both by the ongoing international naval arms race and a desire among Japanese naval planners to maintain a fleet of capital ships powerful enough to defeat the United States Navy in an encounter in Japanese territorial waters . The IJN 's fleet of battleships had proven highly successful in 1905 , the last year of the Russo @-@ Japanese War , which culminated in the destruction of the Russian Second and Third Pacific Squadrons at the Battle of Tsushima . + In the aftermath , the Japanese Empire immediately turned its focus to the two remaining rivals for imperial dominance in the Pacific Ocean : Britain and the United States . Satō Tetsutarō , a Japanese Navy admiral and military theorist , speculated that conflict would inevitably arise between Japan and at least one of its two main rivals . To that end , he called for the Japanese Navy to maintain a fleet with at least 70 % as many capital ships as the US Navy . This ratio , Satō theorized , would enable the Imperial Japanese Navy to defeat the US Navy in one major battle in Japanese waters in any eventual conflict . 
Accordingly , the 1907 Imperial Defence Policy called for the construction of a battle fleet of eight modern battleships , 20 @,@ 000 long tons ( 20 @,@ 321 t ) each , and eight modern armoured cruisers , 18 @,@ 000 long tons ( 18 @,@ 289 t ) each . This was the genesis of the Eight @-@ Eight Fleet Program , the development of a cohesive battle line of sixteen capital ships . + The launch of HMS Dreadnought in 1906 by the Royal Navy raised the stakes , and complicated Japan 's plans . Displacing 17 @,@ 900 long tons ( 18 @,@ 200 t ) and armed with ten 12 @-@ inch ( 30 @.@ 5 cm ) guns , Dreadnought rendered all existing battleships obsolete by comparison . The launch of the battlecruiser HMS Invincible the following year was a further setback for Japan 's quest for parity . When the two new Satsuma @-@ class battleships and two Tsukuba @-@ class armoured cruisers , launched by 1911 , were outclassed by their British counterparts , the Eight @-@ Eight Fleet Program was restarted . + The first battleships built for the renewed Eight @-@ Eight Fleet Program were the two dreadnoughts of the Kawachi class , ordered in 1907 and laid down in 1908 . In 1910 , the Navy put forward a request to the Diet ( parliament ) to secure funding for the entirety of the program at once . Because of economic constraints , only four battlecruisers and a single battleship of the Fusō class were ultimately approved by the Diet . Three more Fusō @-@ class ships ( Yamashiro , Ise , and Hyūga ) were approved and all three were ordered in April 1913 . While Yamashiro was laid down later that year , the IJN lacked the funding to proceed with the construction of Ise and Hyūga until the Diet authorized additional funding for the ships in July 1914 . + + = = Design and description = = + + The progress of Fusō 's construction , while the IJN waited for the funding to be released and foreign developments , caused the IJN to reassess the Fusō @-@ class design . 
The distribution of the midships gun turrets was the most obvious flaw as they complicated the protection of the midships magazine and exposed more of the ship to the blast effects of the guns when they fired . Another issue was that Japanese sailors had problems maintaining a high rate of fire with the 45 @.@ 36 @-@ kilogram ( 100 @.@ 0 lb ) shells used in the manually loaded 152 @-@ millimetre ( 6 in ) secondary guns used in the Fusō class and earlier designs . To resolve this issue , the IJN designed a smaller 140 @-@ millimetre ( 5 @.@ 5 in ) gun that offset its lighter shell weight with a higher rate of fire . It also decided that the barbette armour of the earlier ships was too thin and wanted a modest increase in speed to partially counter the higher speeds of the latest foreign ships like the British Queen Elizabeth @-@ class battleships and Russian Borodino @-@ class battlecruisers . For financial reasons more powerful engines could not be ordered so the new design was lengthened slightly and the boiler rooms enlarged to increase speed by 0 @.@ 5 knots ( 0 @.@ 93 km / h ; 0 @.@ 58 mph ) to 23 knots ( 43 km / h ; 26 mph ) . To save weight the forecastle deck was shortened so that the lower midships gun turret was lower than in the Fusō class . This reduced the crew 's accommodations despite a significant increase in the crew 's numbers and naval historian Fukui Shizuo believed that these ships had the worst habitability of any Japanese capital ship . The final design was designated A @-@ 92 by the IJN and differed enough from the A @-@ 64 design of the Fusō class that it was considered a separate class . + The ships had a length of 208 @.@ 18 metres ( 683 ft 0 in ) overall , a beam of 28 @.@ 65 metres ( 94 ft 0 in ) and a draught of 8 @.@ 93 metres ( 29 ft 4 in ) at deep load . They displaced 36 @,@ 500 long tons ( 37 @,@ 100 t ) at deep load , roughly 650 long tons ( 660 t ) more than the preceding class . 
Their crew consisted of 1 @,@ 360 officers and enlisted men . They had a metacentric height of 1 @.@ 737 metres ( 5 ft 8 @.@ 4 in ) at deep load . + During the ships ' modernization during the 1930s , their forward superstructures were enlarged with multiple platforms added to their tripod foremasts . Both ships were also given torpedo bulges to improve their underwater protection and to compensate for the weight of the additional armour . In addition , their sterns were lengthened by 7 @.@ 62 metres ( 25 @.@ 0 ft ) . These changes increased their overall length to 213 @.@ 8 metres ( 701 ft ) , their beam to 31 @.@ 75 metres ( 104 ft 2 in ) and their draft to 9 @.@ 45 metres ( 31 ft 0 in ) . Their displacement increased over 5 @,@ 000 long tons ( 5 @,@ 100 t ) to 42 @,@ 001 long tons ( 42 @,@ 675 t ) at deep load . The crew now numbered 1 @,@ 376 officers and enlisted men . + + = = = Propulsion = = = + + The Ise @-@ class ships had two sets of direct @-@ drive steam turbines , each of which drove two propeller shafts with 3 @.@ 429 @-@ metre ( 11 ft 3 in ) propellers . The high @-@ pressure turbines drove the wing shafts while the low @-@ pressure turbines drove the inner shafts . The turbines were designed to produce a total of 40 @,@ 000 or 45 @,@ 000 shaft horsepower ( 30 @,@ 000 or 34 @,@ 000 kW ) ( Hyūga and Ise respectively ) , using steam provided by 24 Kampon Ro Gō water @-@ tube boilers at working pressures of 13 – 16 @.@ 9 kg / cm2 ( 1 @,@ 275 – 1 @,@ 657 kPa ; 185 – 240 psi ) . Both ships comfortably exceeded their designed speed of 23 knots ( 43 km / h ; 26 mph ) during their sea trials ; Ise reached 23 @.@ 6 knots ( 43 @.@ 7 km / h ; 27 @.@ 2 mph ) from 56 @,@ 498 shp ( 42 @,@ 131 kW ) and Hyūga exceeded that with 24 knots ( 44 km / h ; 28 mph ) from 63 @,@ 211 shp ( 47 @,@ 136 kW ) . 
Each of the boilers consumed a mixture of coal and oil and the ships had a stowage capacity of 4 @,@ 607 long tons ( 4 @,@ 681 t ) of coal and 1 @,@ 411 long tons ( 1 @,@ 434 t ) of fuel oil , which gave them a range of 9 @,@ 680 nautical miles ( 17 @,@ 930 km ; 11 @,@ 140 mi ) at a speed of 14 knots ( 26 km / h ; 16 mph ) . Ise and Hyūga had three generators of 150 kilowatts ( 200 hp ) capacity and two 250 @-@ kilowatt ( 340 hp ) turbo generators at 225 volts . + During their 1930s modernization , the boilers on each ship were replaced by eight new Kampon oil @-@ fired boilers , fitted into the former aft boiler room , and the forward funnel was removed . The turbines were replaced by four geared Kampon turbines with a designed output of 80 @,@ 000 shp ( 60 @,@ 000 kW ) intended to increase their speed to 24 @.@ 5 knots ( 45 @.@ 4 km / h ; 28 @.@ 2 mph ) . On her trials , Ise reached a top speed of 25 @.@ 26 knots ( 46 @.@ 78 km / h ; 29 @.@ 07 mph ) from 81 @,@ 050 shp ( 60 @,@ 440 kW ) . The fuel storage of the ships was increased to a total of 5 @,@ 113 long tons ( 5 @,@ 195 t ) of fuel oil that gave them a range of 7 @,@ 870 nautical miles ( 14 @,@ 580 km ; 9 @,@ 060 mi ) at a speed of 16 knots ( 30 km / h ; 18 mph ) . + + = = = Armament = = = + + The twelve 45 @-@ calibre 35 @.@ 6 cm ( 14 @.@ 0 in ) Type 41 guns of the Ise class were mounted in three pairs of twin @-@ gun , superfiring turrets . Numbered one through six from front to rear , each turret weighed 655 long tons ( 666 t ) . The hydraulically powered turrets had an elevation capability of − 5 / + 20 degrees . The guns had a rate of fire of 1 @.@ 5 – 2 rounds per minute and could be loaded at any angle between -3 and + 20 degrees . In 1921 the elevation was increased to + 30 degrees and then to + 43 degrees during their mid @-@ 1930s modernization , except for No. 6 turret as its supporting structure could not be lowered . 
The recoil mechanism of the guns was also changed from a hydraulic to a pneumatic system , which allowed for a faster firing cycle of the main guns . + By World War II , the guns used Type 91 armour @-@ piercing , capped shells . Each of these shells weighed 673 @.@ 5 kilograms ( 1 @,@ 485 lb ) and was fired at a muzzle velocity of 770 – 775 metres per second ( 2 @,@ 530 – 2 @,@ 540 ft / s ) . They had a maximum range of 25 @,@ 000 metres ( 27 @,@ 000 yd ) at + 20 degrees of elevation and 35 @,@ 450 metres ( 38 @,@ 770 yd ) at + 43 degrees after modernization . Also available was a 625 @-@ kilogram ( 1 @,@ 378 lb ) high @-@ explosive shell that had a muzzle velocity of 805 metres per second ( 2 @,@ 640 ft / s ) . A special Type 3 Sanshikidan incendiary shrapnel shell was developed in the 1930s for anti @-@ aircraft use . + The ships ' secondary armament consisted of twenty 50 @-@ calibre 14 @-@ centimetre Type 3 guns . Eighteen of these were mounted in casemates in the forecastle and superstructure and the remaining pair were mounted on the deck above them and protected by gun shields . They had a maximum elevation of + 20 degrees which gave them a range of 16 @,@ 300 metres ( 17 @,@ 800 yd ) . Each gun had a rate of fire of up to 10 rounds per minute . Anti @-@ aircraft defence was provided by four 40 @-@ calibre 3rd Year Type 8 @-@ centimetre AA guns in single mounts . The 7 @.@ 62 @-@ centimetre ( 3 in ) high @-@ angle guns had a maximum elevation of + 75 degrees , and had a rate of fire of 13 to 20 rounds per minute . They fired a 6 kg ( 13 lb ) projectile with a muzzle velocity of 680 m / s ( 2 @,@ 200 ft / s ) to a maximum height of 7 @,@ 500 metres ( 24 @,@ 600 ft ) . The ships were also fitted with six submerged 53 @.@ 3 @-@ centimetre ( 21 @.@ 0 in ) torpedo tubes , three on each broadside . They carried twelve to eighteen 6th Year Type torpedoes which had a 200 @-@ kilogram ( 440 lb ) warhead .
They had three settings for range and speed : 15 @,@ 000 metres ( 16 @,@ 000 yd ) at 26 knots ( 48 km / h ; 30 mph ) , 10 @,@ 000 metres ( 11 @,@ 000 yd ) at 32 knots ( 59 km / h ; 37 mph ) , or 7 @,@ 000 metres ( 7 @,@ 700 yd ) at 37 knots ( 69 km / h ; 43 mph ) . + In 1931 – 33 the AA guns were replaced with eight 40 @-@ caliber 12 @.@ 7 cm ( 5 @.@ 0 in ) Type 89 dual @-@ purpose guns , fitted on both sides of the forward superstructures in four twin @-@ gun mounts . When firing at surface targets , the guns had a range of 14 @,@ 700 metres ( 16 @,@ 100 yd ) ; they had a ceiling of 9 @,@ 440 metres ( 30 @,@ 970 ft ) at their maximum elevation of + 90 degrees . Their maximum rate of fire was 14 rounds a minute , but their sustained rate of fire was around eight rounds per minute . Two twin @-@ gun mounts for license @-@ built Vickers two @-@ pounder light AA guns were also added . These guns had a maximum elevation of + 80 degrees and a rate of fire of 200 rounds per minute . The pair of 14 cm guns on the upper deck were removed at this time . + During the mid @-@ 1930s reconstruction the torpedo tubes were removed and the Vickers two @-@ pounders were replaced by twenty license @-@ built Hotchkiss 25 mm Type 96 light AA guns in 10 twin @-@ gun mounts . This was the standard Japanese light AA gun during World War II , but it suffered from severe design shortcomings that rendered it a largely ineffective weapon . According to historian Mark Stille , the twin and triple mounts " lacked sufficient speed in train or elevation ; the gun sights were unable to handle fast targets ; the gun exhibited excessive vibration ; the magazine was too small , and , finally , the gun produced excessive muzzle blast " . These 25 @-@ millimetre ( 0 @.@ 98 in ) guns had an effective range of 1 @,@ 500 – 3 @,@ 000 metres ( 1 @,@ 600 – 3 @,@ 300 yd ) , and an effective ceiling of 5 @,@ 500 metres ( 18 @,@ 000 ft ) at an elevation of 85 degrees . 
The maximum effective rate of fire was only between 110 and 120 rounds per minute because of the frequent need to change the fifteen @-@ round magazines . In addition the forward pair of 14 cm guns in the forecastle were removed at this time and the maximum elevation of the remaining guns was increased to + 30 degrees . + + = = = Protection = = = + + The Ise @-@ class ships ' waterline protective belt had a maximum thickness of 299 mm ( 11 @.@ 8 in ) of Vickers cemented armour amidships ; below it was a strake of 100 mm ( 3 @.@ 9 in ) armour . The upper armoured deck consisted of two layers of high @-@ tensile steel 55 mm ( 2 @.@ 2 in ) thick and the lower armoured deck also consisted of two layers of high @-@ tensile steel , but only 30 mm ( 1 @.@ 2 in ) thick . The sides of this deck sloped downwards to meet the bottom of the lower strake of the belt armour . The ends of the belt armour were closed off by bulkheads that ranged in thickness from 203 to 102 mm ( 8 to 4 in ) . The turrets were protected with an armour thickness of 254 mm ( 10 in ) on the face and 76 mm on the roof . The casemate armour was 149 mm ( 5 @.@ 9 in ) thick and that of the barbettes was 299 mm thick rather than the originally planned 305 mm . The sides of the conning tower were 305 mm thick . + The Ise class were the only Japanese battleships to place the powder magazine above the shell magazine as the IJN wished to put as much space as possible between the highly flammable propellant and mine and torpedo detonations . The danger from plunging shells at long distances was not appreciated until the fatal magazine explosions of three British battlecruisers during the 1916 Battle of Jutland graphically demonstrated the point . To further protect the magazines the depth of the double bottom was increased to a total of 3 @.@ 58 metres ( 11 ft 9 in ) underneath the barbettes and magazines . 
Additionally , the vessels contained 660 watertight compartments to preserve buoyancy in the event of battle damage . In addition to the torpedo bulge added when the ships were modernized , the deck armour over the machinery and magazines was increased to a total thickness of 140 mm . Inside the original skin of the ships , two torpedo bulkheads were also added and the turret roofs were increased to a total of 152 millimetres ( 6 in ) of armour . + + = = = Fire control and sensors = = = + + While the details of the ship 's fire @-@ control instruments are not fully available , it is known that the ships were fitted with a fire @-@ control director after completion . No computer was fitted at that time and data from the rangefinders had to be processed manually . Turrets 2 , 3 , and 5 were built with imported 6 @-@ metre ( 19 ft 8 in ) Bausch & Lomb rangefinders . These were felt to be inferior to the British Barr & Stroud instruments used on other ships and were removed in 1920 . They were replaced by either the British rangefinders or domestically built instruments of 6 or 8 metres ( 19 ft 8 in or 26 ft 3 in ) length . In the late 1920s the fire @-@ control systems were upgraded and additional platforms were added to the foremast to accommodate them . A pair of directors for the 12 @.@ 7 cm AA guns were added , one on each side of the forward superstructure , in the early 1930s . The fire @-@ control systems were again upgraded in the mid @-@ 1930s and directors were added for the 25 mm AA guns . Both ships had 10 @-@ metre ( 32 ft 10 in ) rangefinders installed at the top of the pagoda mast at that time . Type 21 air @-@ search radars were installed aboard the sisters in mid @-@ 1942 . + + = = = Aircraft = = = + + Ise was briefly fitted with an aircraft flying @-@ off platform for a Mitsubishi 1MF3 fighter on Turret No. 2 in 1927 . It was replaced by a platform on Turret No. 5 for a Yokosuka E1Y reconnaissance floatplane in 1928 – 29 . 
A catapult and a collapsible 4 @-@ tonne ( 3 @.@ 9 @-@ long @-@ ton ) crane were fitted on the stern during the mid @-@ 1930s modernization , and the ships were equipped to operate three floatplanes , although no hangar was provided . The initial Nakajima E4N2 biplanes were replaced by Nakajima E8N2 biplanes in 1938 . + + = = Conversion to hybrid carriers = = + + The sinking of the British capital ships Prince of Wales and Repulse by Japanese land @-@ based aircraft on 10 December 1941 led the IJN to realize that battleships could not operate in the face of enemy aircraft and required friendly air support to protect them . The loss of four Japanese aircraft carriers during the Battle of Midway in June 1942 severely limited the ability of the IJN to provide any air cover and alternatives were sought . Earlier proposals to convert one or more battleships into carriers had been made and rejected at the beginning of the war , but they were revived after Midway . Plans for more elaborate conversions were rejected on the grounds of expense and , most critically , time , and the IJN settled on removing the rear pair of turrets and replacing them with a flight deck equipped with two catapults to launch floatplanes . The Ise @-@ class ships were selected for the conversion because Hyūga had suffered an explosion in Turret No. 5 in early May that virtually destroyed the turret and their Turret No. 6 could not elevate to the full + 43 degrees deemed necessary for the long @-@ range engagement anticipated by the IJN . The Fusōs were scheduled to follow once the first two were completed . + + = = = Armament changes = = = + + The rear turrets , the barbettes and their supporting structures were removed beginning in early 1943 and the openings in the middle deck were covered by 152 mm plates salvaged from the turret armour . All of the 14 cm guns were removed and the casemate openings sealed off . 
Four additional twin 12 @.@ 7 cm mounts were added , one pair abreast the funnel and the other abreast the conning tower . The original ten twin 25 mm gun mounts were replaced by triple mounts and nine new triple mounts were added , a total of 57 guns . Two each Type 94 and Type 95 AA directors were added to control the additional guns . The ammunition for these new guns was stored in the magazines originally used for the 14 cm guns and for Turret No. 5 . During 1944 , the ships ' AA defences were reinforced with an additional dozen triple and eleven single 25 mm gun mounts , for a total of 104 barrels , and a pair of Type 13 early warning radars were added . In September six 30 @-@ round AA rocket launchers were added on the sides of the flight deck . + + = = = Flight deck arrangements = = = + + A 70 @-@ metre @-@ long ( 229 ft 8 in ) flight deck was built above the stern and stretched forward to the rebuilt aft superstructure . The flight deck was 29 metres ( 95 ft 2 in ) wide at its forward end and 13 metres ( 42 ft 8 in ) at the stern . It overhung the stern and increased the overall length of the ships to 219 @.@ 62 metres ( 720 ft 6 in ) . A pair of rotating gunpowder @-@ propelled catapults were fitted on the sides of the hull , forward of the aft superstructure where they partially restricted the arc of fire of the two amidships turrets . They could launch aircraft up to 4 @,@ 600 kilograms ( 10 @,@ 100 lb ) in weight and required 30 seconds to launch each aircraft . The flight deck had eight permanent storage positions connected by rails to the catapults and the hydraulically operated aircraft lift that brought the aircraft up from the hangar below on the trolleys used to move the floatplanes about . Two aircraft were intended to be stowed on the catapults and three more in temporary positions on the flight deck for a total of thirteen . 
+ The 40 @-@ metre @-@ long ( 131 ft 3 in ) hangar was 20 metres ( 65 ft 7 in ) wide forward and 11 metres ( 36 ft 1 in ) at the rear . It was 6 metres ( 19 ft 8 in ) high and designed to stow nine aircraft . It was fitted with fire fighting foam and carbon dioxide dispensers as a result of wartime experience . The ' T ' -shaped lift was 12 @.@ 1 metres ( 39 ft 8 in ) wide at its forward end and 6 @.@ 6 metres ( 21 ft 8 in ) wide at its aft end . It was 12 @.@ 1 metres long and had a capacity of 6 tonnes ( 5 @.@ 9 long tons ) . Petrol storage tanks with a capacity of 76 tonnes ( 75 long tons ) were installed in the former magazine of Turret No. 6 to provide each aircraft with enough fuel for three sorties . To recover the aircraft the collapsible crane formerly on the stern was moved up to the port side of the flight deck . Another crane was intended on the starboard side , but it was never fitted . + The ships had an air group of 11 each of Yokosuka D4Y dive bombers ( Allied reporting name " Judy " ) and Aichi E16A reconnaissance aircraft ( Allied reporting name " Paul " ) . Both aircraft had development problems and neither air group ever had all of its intended aircraft . Coupled with a shortage of trained pilots , neither ship ever used its aircraft during combat . + + = = = Other changes = = = + + After the loss of the fast battleship Hiei at the Naval Battle of Guadalcanal in late 1942 to rudder damage , the IJN decided to reinforce the protection of the steering compartment and to create an auxiliary steering compartment . The protection of the former was strengthened by the addition of a concrete wall at least 1 metre ( 3 ft 3 in ) in thickness and some of the armour removed from the turrets was used to protect the latter . The double bottom below the former positions of aft turrets was converted to hold fuel oil ; this increased the ships ' endurance to 9 @,@ 500 nautical miles ( 17 @,@ 600 km ; 10 @,@ 900 mi ) at a speed of 16 knots .
A pair of Type 22 surface @-@ search radars were also fitted during the conversion . + The removal of the secondary armament , the rear turrets and their supporting structures was generally compensated by the addition of the flight deck , hangar , AA guns and more fuel , and the metacentric height increased by 0 @.@ 23 metres ( 9 @.@ 1 in ) to 2 @.@ 81 metres ( 9 ft 3 in ) at full load as a result of the reduction in the displacement by over 2 @,@ 000 tonnes ( 2 @,@ 000 long tons ) to 40 @,@ 444 tonnes ( 39 @,@ 805 long tons ) . This also reduced the draught to 9 @.@ 03 metres ( 29 ft 8 in ) . The overhang of the flight deck at the stern increased the overall length to 219 @.@ 62 metres ( 720 ft 6 in ) and the beam was slightly reduced to 31 @.@ 71 metres ( 104 ft 0 in ) . + + = = Ships = = + + + = = Service = = + + Upon commissioning , the sister ships were assigned to the 1st Battleship Division of the 1st Fleet . Hyūga had an explosion in one of her main gun turrets that killed 11 men and injured 25 in 1919 ; the following year she accidentally collided with and sank a schooner , losing two crewmen . Before the start of the Pacific War , both ships frequently exercised off the coasts of the Soviet Union , Korea and China in addition to training in Japanese waters . Ise hosted Edward , Prince of Wales , and his aide @-@ de @-@ camp Lieutenant Louis Mountbatten in 1922 during the prince 's visit to Japan . In Korea Bay when the 1923 Great Kantō earthquake struck , they sailed to Kyushu where they loaded supplies for the victims on 4 September . Together with two other battleships and a pair of light cruisers , Ise sank the destroyer Yayoi in 1926 during gunnery practice . Ise 's AA armament was upgraded in 1931 and Hyūga 's two years later . The latter ship was modernized in 1934 – 36 and Ise in 1935 – 37 , both at Kure Naval Arsenal .
During the Second Sino @-@ Japanese War , the sisters frequently patrolled the Chinese coast in support of the blockade imposed by Japan . In August 1937 Hyūga ferried two battalions of Special Naval Landing Forces to Port Arthur . Three years later , she served as the flagship for the Emperor of the puppet state of Manchukuo , Henry Pu @-@ yi , during his state visit to Japan in June 1940 . On 15 November the ships were transferred to the 2nd Battleship Division of the 1st Fleet . The sisters were refitted in late 1940 in preparation for war , which included the fitting of external degaussing coils and additional AA directors . + + = = = World War II = = = + + When Japan began the Pacific War on 8 December , the sisters sortied for the Bonin Islands with four other battleships and the light carrier Hōshō as distant cover for the fleet attacking Pearl Harbor , and returned six days later . On 11 March 1942 Ise and Hyūga sortied from their anchorage at Hashirajima to join the unsuccessful search for the American carrier force that had attacked Marcus Island a week earlier . Similarly they pursued but did not catch the American carriers that had launched the Doolittle Raid on 18 April . + During gunnery training on 5 May , there was a premature detonation in the left gun of Hyūga 's Turret No. 5 that disabled both guns and killed 51 crewmen . Both aft magazines were flooded to douse the resulting fire and save the ship . She received temporary repairs during which the turret was removed and replaced by a circular armour plate on which three triple 25 mm gun mounts were positioned . On 11 May a valve in Ise 's No. 2 engine room stuck in the open position and flooded the engine room . While under repair at Kure , both ships received prototype Type 21 radars . 
Commanded by Vice @-@ Admiral Shirō Takasu , the 2nd Battleship Division set sail with the Aleutian Support Group on 28 May , at the same time that most of the Imperial Fleet began an attack on Midway Island ( Operation MI ) . + They returned home on 14 June and the IJN began preliminary planning to replace the lost carriers with hybrid carriers converted from battleships . The sisters were selected for conversion and detached from the division on 14 July in preparation . They remained on " standby alert " until the actual conversions began . Ise was converted at Kure Naval Arsenal from 23 February to 5 September 1943 and Hyūga at Sasebo Naval Arsenal from 2 May to 30 November . + After completing her sea trials , Ise was attached to the Imperial Japanese Naval Academy at Etajima and ferried troops and munitions to the naval base at Truk in October . In November the ship began working up , joined by the newly completed Hyūga the following month , and both rejoined the 2nd Battleship Division . On 1 May 1944 , the sisters were transferred to Rear Admiral Matsuda Chiaki 's reformed Fourth Carrier Division of the 3rd Fleet . The division 's 634th Naval Air Group was formed that same day and conducted its first catapult launches in late June . + + = = = = Battle of Cape Engaño = = = = + + Shortages of aircraft and serviceability problems greatly retarded pilot training and the ships only had a total of 17 D4Ys and 18 E16As on hand on 1 October ; of these , only 6 and 16 were operational , respectively . The Japanese plan for the defence of the Philippines envisioned that the surviving carriers would be used to lure the American carrier forces away from the invasion area to a position where the carriers could be attacked by land @-@ based aircraft and the transports by the rest of the IJN . The other carrier air groups were not in much better shape and the Japanese decided to retain the aircraft ashore for use against the American carriers . 
The Fourth Carrier Division was assigned to the Northern Force under the command of Vice Admiral Jisaburō Ozawa and the sisters sailed from Yashima on 20 October . On the morning of 24 October , the bulk of the few aircraft aboard were launched to attack the American carriers as a distraction . They inflicted no damage and caused the Americans to search in the direction from which they had attacked . The Americans finally spotted the Japanese carriers at 16 : 40 , some 200 miles ( 320 km ) east of Cape Engaño , the northeastern tip of Luzon . The American carriers were spread out and it was very late in the day to launch an airstrike , so Admiral William Halsey , commander of the Third Fleet , decided to mass his carriers in a position to attack the following morning . Ozawa reversed course during the night , correctly believing that the Americans would follow him north . + Although they had lost contact during the night , the Americans did find the Japanese carriers at 07 : 35 . They had already launched an airstrike of 180 aircraft that was orbiting 50 miles ( 80 km ) ahead of the American carriers while waiting for the Japanese ships to be located . This was just the first of a total of five airstrikes that the Americans launched that day . The sisters were not heavily engaged by the early airstrikes which were focusing on the group 's aircraft carriers . Ise claimed to have shot down five attacking dive bombers from the second wave and one small bomb detonated on Turret No. 2 . Hyūga was lightly damaged by near misses that ruptured some hull plating in her bulge and peppered her superstructure with splinters . She took on a 5 @-@ degree list that was quickly corrected before she was ordered to tow the crippled carrier Chiyoda to safety . Her attempt was unsuccessful and Chiyoda had to be abandoned to her fate . + Ise was attacked by 80 @-@ odd aircraft from the fourth wave , but they failed to inflict any serious damage .
She dodged 11 torpedoes and was only hit by a bomb once , on the bulge outboard of the port catapult . Some 34 other bombs near missed her , spraying her with splinters and rupturing some hull plates that contaminated some fuel oil and caused leaks in her port boiler rooms . While an exact total of her casualties is not available , it has been estimated that 5 men were killed and some 111 – 121 crewmen were wounded during this attack . Hyūga was unsuccessfully attacked by an American submarine at 18 : 43 . Around 19 : 00 Ozawa learned about a force of destroyers and cruisers that drove off the Japanese destroyers rescuing survivors from some of the carriers lost earlier in the day and sank Chiyoda . He ordered the Fourth Carrier Division to reverse course and engage the Americans , but the battleships were unable to find them , and Ozawa ordered them to reverse course and head for Amami Ōshima . When they arrived on 27 October , Ozawa transferred to Hyūga and hoisted his flag aboard her . While en route for Kure , the division was unsuccessfully attacked by another submarine . + In early November the catapults were removed from both ships , and they loaded troops and munitions later that month . While en route they were diverted to the Spratly Islands upon reports of heavy air raids at Manila . After off @-@ loading their cargo , they sailed for Lingga Island , near Singapore , on 20 November . They transferred to Cam Ranh Bay , French Indochina and Hyūga became flagship of the 5th Fleet there on 14 December . The division sailed for Singapore on 30 December and Vice Admiral Kiyohide Shima transferred his flag to the light cruiser Ōyodo on arrival there the following day . The division continued onwards to Lingga . Its planned return to Japan was delayed by attacks by the American Third Fleet on targets in Indochina and southern China that sank two oil tankers that were intended to refuel the division .
+ The IJN then decided to use the sisters and their escorts to bring a load of petrol , rubber , tin and other strategic minerals back to Japan after the American carriers departed the South China Sea ( Operation Kita ) . They loaded their cargoes beginning on 6 February at Singapore and departed four days later . Also carrying some 1 @,@ 150 oilfield workers , they were escorted by Ōyodo and three destroyers . Decrypted Japanese radio signals revealed the Japanese plan to the Allies , and 15 submarines were positioned along their anticipated route in an attempt to intercept and sink the ships . An additional 11 were moved into position while the group was en route , but only three were ultimately able to attack . None of them were successful before the Japanese reached Kure on 20 February . The Fourth Carrier Division was disbanded on 1 March and the sisters were reduced to 1st rank reserve ships . On 19 March Kure was attacked by aircraft from Task Force 58 and Hyūga was hit three times by bombs that killed 37 men and wounded 52 . Her gunners claimed to have shot down one American dive bomber during the attack . Ise was hit twice during the attack , but her casualties , if any , are unknown . + The ships were turned into floating AA batteries over the next several months although it availed them little when they were attacked again by American carrier aircraft in July . On the 24th Ise was struck by five bombs and near missed multiple times ; all told she lost 50 crewmen killed and many others wounded . The bombs started numerous leaks and Ise began to settle by the bow , although she was returned to an even keel after three @-@ days pumping . Hyūga was a primary focus of the attack and she received 10 direct hits and up to 30 near misses . She was badly damaged with some 200 @-@ odd crewmen killed and 600 wounded during the attack . She slowly foundered over the next two days and was not attacked when the Americans returned four days later . 
This time it was Ise 's turn and she was struck 11 or more times with many near misses that put her on the bottom in shallow water with a 15 degree list . The sisters were struck off the Navy List in November and their wrecks were scrapped after the war . + + + = Dick Rifenburg = + + Richard Gale " Dick " Rifenburg ( August 21 , 1926 – December 5 , 1994 ) was an American football player and a pioneering television broadcaster for the forerunner to WIVB @-@ TV in Buffalo . He played college football for the University of Michigan Wolverines in 1944 and from 1946 to 1948 . He was a consensus selection at end on the 1948 College Football All @-@ America Team . Rifenburg played professionally in the National Football League ( NFL ) with the Detroit Lions for one season in 1950 . After retiring from football he settled in Buffalo and became a sports broadcaster . He worked as a color commentator and as a play @-@ by @-@ play announcer for the Buffalo Bulls . He hosted various television and radio sports shows and was eventually inducted into the Buffalo Broadcasters Hall of Fame . + In college , he led the Big Ten Conference in single season receptions during his senior year and set Michigan Wolverines receptions records for both career touchdowns and single @-@ season touchdowns . He had also been a Michigan High School Athletic Association ( MHSAA ) state champion in both basketball and track and field . His college career was interrupted by World War II service , and his high school career was also affected by the war due to the MHSAA 's cancellation of state championships in all sports in 1943 . + + = = High school = = + + Rifenburg was born in Petoskey , Michigan , and raised in Kalamazoo , Michigan before his family moved to Saginaw , Michigan . Rifenburg was a star athlete at Saginaw 's Arthur Hill High School in football , basketball , and track and field .
In 1943 , Michigan canceled boys high school tournaments in all sports due to World War II , and they did not return until the fall of 1944 . In 1944 , he led Arthur Hill High to the MHSAA Class A high school basketball championship ( over Kalamazoo Central High School ) , scoring 24 points , including 17 in the second half , of the championship game . Rifenburg was also the state champion in 1944 in both the shot put 46 feet 11 inches ( 14 @.@ 30 m ) and high jump 5 feet 8 @.@ 5 inches ( 1 @.@ 74 m ) . He also led Arthur Hill in football , and his high school accomplishments are featured in Glory : The history of Saginaw County sports by Jack Tany ( ASIN B0006RH9Z6 ) , which is a book on high school sports in Saginaw County , Michigan . Rifenburg was named All State in football , basketball and track . + It is ironic that Rifenburg was born in Petoskey , Michigan in 1926 for several reasons . Ted Petoskey preceded Rifenburg as an All @-@ American end on the University of Michigan football team . Petoskey had excelled as a representative of Saginaw County in MHSAA competition . Petoskey posted significant football accomplishments in 1926 making 1926 a significant year for himself as well . Achieving All @-@ American status as an end at Michigan would be Rifenburg 's next step after excelling in MHSAA competition . + + = = College = = + + In the fall of 1944 , Rifenburg enrolled at the University of Michigan . The United Press syndicate ran a feature article about Rifenburg in September 1944 that opened as follows : " Another great end has made his appearance on the Big Ten football horizon in the person of Dick Rifenburg , 18 @-@ year @-@ old Michigan freshman . Every so often a great offensive end comes along , a player who has to learn how to play defense , but who has the natural speed , smooth actions , height and big hands that is the mark of an outstanding pass receiver . Rifenburg has laid claim to that rating . 
A loose @-@ limbed 180 @-@ pound freshman from Saginaw , Mich . , Rifenberg is being boomed as the Big Ten 's next ' freshman sensation . ' " As a freshman , he caught two touchdown passes in his first college football game against Iowa . In an article titled " Teens and TNT , " Time reported on Rifenburg 's performance : " Of the few teams already in action , Michigan 's teens rang the freshman bell loudest last week by winning their opener , 12 -to @-@ 7 , against the strong Iowa Seahawks ( Naval Pre @-@ Flight ) ; 6 @-@ ft . 4 Freshman End Dick Rifenburg caught passes and ran for both Michigan touchdowns . " + Rifenburg 's college career was interrupted by World War II service in the United States Navy , but after missing the 1945 season , he returned to play for the Wolverines from 1946 to 1948 . Rifenburg played for the Wolverines in consecutive undefeated National Championship seasons in 1947 and 1948 . He started nine games for the 1947 team . The 1947 team , referred to as " Michigan 's Mad Magicians " , is considered to be the greatest University of Michigan football team of all time . Rifenburg and teammate Len Ford had a reputation as the team 's practical jokers . During the 1947 game against Wisconsin , Rifenburg started calling signals for the Badgers . Wisconsin 's offense protested to officials , who " prowled the Wolverines secondary but never caught their man . " Rifenburg continued to scramble Badger signals , as Rifenburg 's teammates laughed at his scheme . In the January 1 , 1948 Rose Bowl that season , Michigan rolled to a 49 – 0 victory over USC , and they outgained the Trojans 491 yards to 133 . Rifenburg caught a 29 @-@ yard pass for the game 's final score . + In the 1948 championship season , Rifenburg scored eight touchdowns , caught 22 passes , and gained 610 yards ( 508 receiving and 102 rushing ) .
Rifenburg was the second highest scoring end in the nation in 1948 , and he was a consensus All @-@ American as a senior , being selected as first team on nine of the 11 All @-@ American teams . Rifenburg led the Big Ten in receptions . + Although Rifenburg finished fourth among midwestern Heisman voters in 1948 , he did not finish among the top eight . By comparison , Notre Dame end Leon Hart won the Heisman Trophy in 1949 but made only eight of the 11 All @-@ American teams . It is not clear why Rifenburg did not finish higher . However , it is fairly clear that sportswriters of that era had a bias against Michigan . In the Associated Press poll at the end of the 1947 season , the Notre Dame Fighting Irish were ranked ahead of the University of Michigan , though both teams were undefeated . Some noted that every Southern AP voter had voted for Notre Dame , which had yet to integrate , whereas three of Michigan 's star players ( Bob Mann , Gene Derricotte , and Len Ford ) were African @-@ American . The Southern schools refused even to schedule games against schools that played African @-@ American players . + Rifenburg was considered one of the greatest Wolverines of the 1940s . In four seasons with the Michigan Wolverines , Rifenburg played in 32 games and had over 1 @,@ 000 yards of total offense . Rifenburg held the University of Michigan 's single season and career record for touchdown receptions ( eight in a season ; sixteen career ) until his records were broken by Anthony Carter in 1980 . + + = = Professional career = = + + In 1948 , Rifenburg was drafted by the Philadelphia Eagles in the 15th round of the NFL draft , and he was also drafted by the New York Yankees of the All @-@ America Football Conference . He had intended to play in 1949 with the Yankees , but suffered a knee injury in a practice session for the August 1949 Chicago College All @-@ Star Game .
Press accounts at the time noted that the injury " will probably keep him out of pro football all season , if not forever . " The incident led to a debate as to whether NFL owners should " bar their men from playing with the college all @-@ stars . " + Rifenburg landed a job at WJR radio in Detroit , but he left his sportscaster 's job to join the Detroit Lions . In the 1950 NFL season , Rifenburg came back from his injury to play for the Detroit Lions . He played in 12 games and had ten receptions for 96 yards and one touchdown for the 1950 Lions . Rifenburg recalled that his playing time with the Lions was limited because the Lions also signed 1949 Heisman Trophy winner Leon Hart , who played the same position . + In May 1951 , he announced he was retiring from professional football to become sports director at a radio station in Buffalo . He was hired as a sportscaster by WBEN ( now known as WIVB ) , which had just started the first television station in Buffalo and the only one serving Southern Ontario . This was an early foray into television by the Buffalo Evening News . In the 1950s , Rifenburg hosted a popular panel show called " Let 's Talk Sports " in Buffalo and also pioneered an early morning exercise program . He also worked for WBEN ( AM ) and WBEN ( FM ) and as the sideline announcer for Buffalo Bills games along with Van Miller , the long time Bills play @-@ by @-@ play announcer . In addition , he served as the play @-@ by @-@ play announcer for the University of Buffalo Bulls football team . As a radio broadcaster , he is remembered for things ranging from ski reports , to 17 years worth of " Breakfast At — " programs live from various local restaurants , to 27 years as the WBEN @-@ AM All Night Show host . + After 30 years with WBEN and a change in ownership for the station , his show was replaced with the Mutual Network 's The Larry King Show . 
In the 1980s , Rifenburg taught communications at Buffalo 's Medaille College and served as a disc jockey on Public Broadcasting 's radio station WEBR ( now WDCZ ) . He also sold ads for Buffalo Evening News competitor , Buffalo Courier @-@ Express . Rifenburg 's final employer was Erie County , who hired him as an inmate training supervisor at the Erie County Correctional Facility . + Rifenburg was posthumously inducted into the Buffalo Broadcasters Hall of Fame in September 2007 . He was given the Golden Age Award which is reserved for " those who did it first , the people who had no pattern to follow . " The Hall of Fame award was presented to Rifenburg 's wife , Jane . In her acceptance speech , Jane Rifenburg observed that despite all of her late husband 's achievements , there was one thing he had never received : " He had a great career , but he never had a trophy . And now he has . " + + = = Family = = + + Rifenburg lived 37 of his years in Buffalo . His wife , the former Jane Morris , was the head of the Buffalo Jills cheerleaders when they met . Rifenburg , who was survived by three sons , ( Douglas A. , Gary R. , and Bruce R. ) one daughter ( Wendy J. Colf ) and two grandchildren , died in Cheektowaga , New York in December 1994 ; he was 68 years old . Doug was a 1988 first team football All @-@ Western New York linebacker for Clarence High School . + + + = 1933 Treasure Coast hurricane = + + The 1933 Treasure Coast hurricane was the second @-@ most intense tropical cyclone to strike the United States during the active 1933 Atlantic hurricane season . The eleventh tropical storm , fifth hurricane , and the third major hurricane of the season , it formed east @-@ northeast of the Leeward Islands on August 31 . The tropical storm moved rapidly west @-@ northwestward , steadily intensifying to a hurricane . 
It acquired peak winds of 140 miles per hour ( 225 km / h ) and passed over portions of the Bahamas on September 3 , including Eleuthera and Harbour Island , causing severe damage to crops , buildings , and infrastructure . Winds over 100 mph ( 161 km / h ) affected many islands in its path , especially those that encountered its center , and many wharves were ruined . + Subsequently , it weakened and made landfall at Jupiter , Florida , early on September 4 with winds of 125 mph ( 201 km / h ) . The hurricane moved across the state , passing near Tampa before moving into Georgia and dissipating . In Florida , the strong winds of the cyclone blew buildings off their foundations , and numerous trees were prostrated in citrus groves . The Treasure Coast region received the most extensive destruction , and Stuart , Jupiter , and Fort Pierce were heavily damaged . Inland , the cyclone weakened rapidly but produced prodigious amounts of rain , causing a dam to collapse near Tampa . The storm caused $ 3 million in damage ( 1933 USD ) after damaging or destroying 6 @,@ 848 homes . + Unusually , the storm hit Florida less than 24 hours before another major hurricane bearing 125 @-@ mile @-@ per @-@ hour ( 201 km / h ) winds struck South Texas ; never have two major cyclones hit the United States in such close succession . + + = = Meteorological history = = + + The origins of the hurricane were from a tropical wave that possibly spawned a tropical depression on August 27 , although there was minimal data over the next few days as it tracked to the west @-@ northwest . On August 31 , a nearby ship reported gale force winds , which indicated that a tropical storm had developed to the east @-@ northeast of the Lesser Antilles . Based on continuity , it is estimated the storm attained hurricane status later that day . Moving quickly to the west @-@ northwest , the storm passed north of the Lesser Antilles and Puerto Rico . 
Early on September 2 , a ship called the Gulfwing reported a barometric pressure of 978 mbar ( 28 @.@ 88 inHg ) , which confirmed that the storm attained hurricane status . After passing north of the Turks and Caicos islands , the hurricane struck Eleuthera and Harbour Island in the Bahamas on September 3 , the latter at 1100 UTC . A station on the latter island reported a pressure of 27 @.@ 90 inHg ( 945 mb ) during the 30 minute passage of the eye . Based on the pressure and the small size of the storm , it is estimated the hurricane struck Harbour Island with peak winds of 140 mph ( 225 km / h ) , making it the equivalent of a modern Category 4 hurricane on the Saffir @-@ Simpson scale . Interpolation suggested that the storm reached major hurricane status , or Category 3 status , on September 2 . + The hurricane initially followed the course of another hurricane that passed through the area in late August , which ultimately struck Cuba and Texas . This hurricane instead maintained a general west @-@ northwest track . After moving through the northern Bahamas , the hurricane weakened slightly before making landfall at Jupiter , Florida , at 0500 UTC on September 4 . A station there reported a pressure of 27 @.@ 98 inHg ( 948 mb ) during a 40 minute period of the eye 's passage ; this suggested a landfall strength of 125 mph ( 201 km / h ) . At the time , the radius of maximum winds was 15 mi ( 24 km ) , which was smaller than average . After landfall , the hurricane weakened rapidly while crossing the state . It briefly emerged into the Gulf of Mexico as a tropical storm early on September 5 . A few hours later while continuing to the northwest , it made another landfall near Rosewood — a ghost town in Levy County , east of Cedar Key — with winds of about 65 mph ( 105 km / h ) . Turning to the north , the storm slowly weakened as it crossed into Georgia , dissipating on September 7 near Augusta . 
+ + = = Preparations and impact = = + + On September 2 , a fleet of eight aircraft evacuated all white residents from West End , Grand Bahama , to Daytona Beach , Florida . While the storm was near peak intensity on September 3 , the Weather Bureau issued hurricane warnings from Miami to Melbourne , Florida , with storm warnings extending northward to Jacksonville . Later that day , storm warnings were issued from Key West to Cedar Key . About 2 @,@ 500 people evacuated by train from areas around Lake Okeechobee . By evening on September 3 , high tides sent sea spray over coastal seawalls in Palm Beach County as residents boarded up buildings ; structures on Clematis Street in West Palm Beach were said to be a " solid front " of plywood . Along the coast , observers reported very rough seas as the eye neared land . + The powerful hurricane moved over or near several islands in the Bahamas . Winds on Spanish Wells and Harbour Island were both estimated at around 140 mph ( 225 km / h ) . Winds reached 110 mph ( 177 km / h ) at Governor 's Harbour , 100 mph ( 161 km / h ) on Eleuthera , and 120 mph ( 193 km / h ) on the Abaco Islands . The storm was farther away from Nassau , where winds reached 61 mph ( 98 km / h ) . The hurricane damaged a lumber mill on Abaco , washing away a dock . Heavy damage occurred on Harbour Island , including to several roofs , the walls of government buildings , and the water system . The hurricane destroyed four churches and 37 houses , leaving 100 people homeless . A 1 @.@ 5 mi ( 2 @.@ 4 km ) road on Eleuthera was destroyed . Several islands sustained damage to farms , including the total loss of various fruit trees on Russell Island . Despite Category 4 winds on Spanish Wells , only five houses were destroyed , although most of the remaining dwellings lost their roofs . Collectively between North Point , James Cistern , and Gregory Town on Eleuthera , the storm destroyed 55 houses and damaged many others .
On Grand Bahama , where a 9 to 12 ft ( 2 @.@ 7 to 3 @.@ 7 m ) storm surge was reported , half of the houses were destroyed , as were 13 boats and two planes , and most docks were wrecked . + When the storm moved ashore in Florida , winds reached an estimated 125 mph ( 201 km / h ) in Jupiter ; these occurred after the eye passed . In West Palm Beach , anemometers measured at least 80 @-@ mile @-@ per @-@ hour ( 129 km / h ) winds with gusts to 100 mph ( 161 km / h ) ; barometers ranged from 28 @.@ 64 to 28 @.@ 78 inHg ( 970 to 975 mb ) . The storm produced the strongest winds in the city since the 1928 Okeechobee hurricane . Winds were not as strong farther from the center ; 40 to 45 mph ( 64 to 72 km / h ) winds were observed in Miami to the south , Titusville to the north , and Tampa on the west coast . Fort Pierce estimated peak winds of 80 to 90 mph ( 129 to 145 km / h ) , and pressures dipped to 29 @.@ 14 inHg ( 987 mb ) . Inland , winds near Lake Okeechobee peaked at only 60 mph ( 97 km / h ) . The hurricane dropped heavy rainfall along its path , peaking at 17 @.@ 8 in ( 450 mm ) in Clermont . + At West Palm Beach , the majority of the damage was confined to vegetation . Several coconut and royal palms that withstood the 1928 hurricane snapped , littering streets with broken trunks . Winds downed road signs on many streets , and floodwaters covered the greens on a local golf course . Some garages and isolated structures , mostly lightweight , were partly or totally destroyed , along with a lumber warehouse . Some homes that lost roofing shingles had water damage to their interiors as well . Nearby Lake Worth sustained extensive breakage of windows , including plate glass , and loss of tile and shingle roofing , but preparations reduced losses to just several thousand dollars , and no post @-@ storm accidents took place . Strong winds snapped many light poles in the city , and trees and shrubs were broken or uprooted . 
As in Lake Worth , officials in West Palm Beach credited preparations and stringent building codes with reducing overall damage . The city had learned from previous experience with severe storms in 1926 , 1928 , and 1929 . High tides eroded Ocean Boulevard at several spots and disrupted access to several bridges on the Lake Worth Lagoon . Winter estates and hotels on Palm Beach generally sustained little material damage , except to vegetation , and county properties went largely unscathed . + In Martin and St. Lucie counties , the storm was considered among the worst on record . The storm leveled some homes and swept many others off their foundations . At Stuart , winds removed or badly damaged 75 % of the roofs in town . The storm destroyed the third floor of the building that housed a bowling alley and the Stuart News , a local newspaper . At Olympia , an abandoned settlement also known as Olympia Beach , strong winds leveled the old Olympia Inn , a gas station , and the second floor of a pharmaceutical building . Winds also tore the roof off an ice plant . A bridge leading to the barrier island from Olympia was partly wrecked ; the bridge tender survived by gripping the railing during the storm . Winds leveled his nearby home . According to the Monthly Weather Review , some of the most severe damage from the storm in Florida was at Olympia . The storm left many homes in Hobe Sound uninhabitable , forcing crews to tear them down . Winter estates on the island , however , were better built and little damaged . While Stuart and Hobe Sound sustained significant damage , Port Salerno suffered minimally . In Stuart , the storm left 400 to 500 people homeless , up to nearly 10 % of the population , which was 5 @,@ 100 at the time . Between Jupiter and Fort Pierce , the storm knocked down power and telegraph lines . In the latter city , high waves washed out a portion of the causeway . 
In the 1980s , an elderly resident recalled that the storm was the most severe on record in Fort Pierce . + Crop damage was worst along the Indian River Lagoon ; several farms in Stuart experienced total losses , and statewide , 16 % of the citrus crop , or 4 million boxes , were destroyed . Many chicken coops in Stuart were destroyed , and the local chicken population was scattered and dispersed as far as Indiantown . Across southeastern Florida , the hurricane damaged 6 @,@ 465 houses and destroyed another 383 , causing over $ 3 million in damage . One person , an African American farm worker , was killed when his shack blew down in Gomez , a brakeman died after seven railcars derailed , and a child was killed by airborne debris . + High rainfall caused flooding across Florida , notably near Tampa where waters reached 9 ft ( 2 @.@ 7 m ) deep . High rainfall of over 7 in ( 180 mm ) caused a dam operated by Tampa Electric Co. to break 3 mi ( 4 @.@ 8 km ) northeast of Tampa along the Hillsborough River . The break resulted in severe local damage , flooding portions of Sulphur Springs . Workers attempted to save the dam with sandbags , and after the break , most residents in the area were warned of the approaching flood . Over 50 homes were flooded , forcing about 150 people to evacuate . Outside Florida , the storm produced winds of 48 and 51 mph ( 78 and 81 km / h ) in Savannah , Georgia and Charleston , South Carolina , respectively . In the latter city , the storm spawned a tornado , which caused about $ 10 @,@ 000 in property damage . Heavy rainfall occurred along the Georgia and South Carolina coasts , reaching over 12 in ( 300 mm ) . Light rainfall also extended into North Carolina . + + = = Aftermath = = + + In the Bahamas after the storm , a boat sailed from Nassau to deliver food and building materials to Eleuthera . + After the storm , the National Guard offered shelters for at least 400 homeless residents in Stuart . 
Of the 7 @,@ 900 families adversely affected by the hurricane , 4 @,@ 325 required assistance from the American Red Cross . Farmers in Texas , also affected by a major hurricane , requested that growers in Florida wait 15 days so they could sell their citrus crop that fell . The damaged dam near Tampa initially resulted in waters from the Hillsborough River being pumped into the city 's water treatment plant , and a new dam was eventually built in 1944 . + + + = Second Battle of Naktong Bulge = + + The Second Battle of Naktong Bulge was an engagement between United Nations ( UN ) and North Korean ( NK ) forces early in the Korean War from September 1 to September 15 , 1950 , along the Naktong River in South Korea . It was a part of the Battle of Pusan Perimeter , and was one of several large engagements fought simultaneously . The battle ended in a victory for the United Nations after large numbers of United States ( US ) and Republic of Korea ( ROK ) troops repelled a strong North Korean attack . + After the First Battle of Naktong Bulge , the US Army 's 2nd Infantry Division was moved to defend the Naktong River line . The division , which was untried in combat , was struck with a strong attack by several divisions of the Korean People 's Army which crossed the river and struck all along the division 's line . The force of the attack split the US 2nd Infantry Division in half , and the North Koreans were able to penetrate to Yongsan , prompting a fight there . + The urgency of the threat to Pusan Perimeter prompted the US Marine Corps 1st Provisional Marine Brigade to be brought in to reinforce the US Army troops . In two weeks of heavy fighting , the US forces were able to force the North Koreans out of the Naktong Bulge region . The North Koreans were further repulsed after the UN counterattack at Inchon , which culminated in the virtual destruction of the North Korean army .
+ + = = Background = = + + + = = = Pusan Perimeter = = = + + From the outbreak of the Korean War and the invasion of South Korea by the North , the North Korean People 's Army had enjoyed superiority in both manpower and equipment over both the Republic of Korea Army and the United Nations forces dispatched to South Korea to prevent it from collapsing . The North Korean strategy was to aggressively pursue UN and ROK forces on all avenues of approach south and to engage them aggressively , attacking from the front and initiating a double envelopment of both flanks of the unit , which allowed the North Koreans to surround and cut off the opposing force , which would then be forced to retreat in disarray , often leaving behind much of its equipment . From their initial June 25 offensive to fights in July and early August , the North Koreans used this strategy to effectively defeat any UN force and push it south . However , when the UN forces , under the Eighth United States Army , established the Pusan Perimeter in August , the UN troops held a continuous line along the peninsula which North Korean troops could not flank , and their advantages in numbers decreased daily as the superior UN logistical system brought in more troops and supplies to the UN army . + When the North Koreans approached the Pusan Perimeter on August 5 , they attempted the same frontal assault technique on the four main avenues of approach into the perimeter . Throughout August , the NK 6th Division , and later the NK 7th Division engaged the US 25th Infantry Division at the Battle of Masan , initially repelling a UN counteroffensive before countering with battles at Komam @-@ ni and Battle Mountain . These attacks stalled as UN forces , well equipped and with plenty of reserves , repeatedly repelled North Korean attacks . North of Masan , the NK 4th Division and the US 24th Infantry Division sparred in the Naktong Bulge area . 
In the First Battle of Naktong Bulge , the North Korean division was unable to hold its bridgehead across the river as large numbers of US reserve forces were brought in to repel it , and on August 19 , the NK 4th Division was forced back across the river with 50 percent casualties . In the Taegu region , five North Korean divisions were repulsed by three UN divisions in several attempts to attack the city during the Battle of Taegu . Particularly heavy fighting took place at the Battle of the Bowling Alley where the NK 13th Division was almost completely destroyed in the attack . On the east coast , three more North Korean divisions were repulsed by the South Koreans at P 'ohang @-@ dong during the Battle of P 'ohang @-@ dong . All along the front , the North Korean troops were reeling from these defeats , the first time in the war their strategies were not working . + + = = = September push = = = + + In planning its new offensive , the North Korean command decided any attempt to flank the UN force was impossible thanks to the support of the UN navy . Instead , they opted to use frontal attack to breach the perimeter and collapse it as the only hope of achieving success in the battle . Fed by intelligence from the Soviet Union , the North Koreans were aware the UN forces were building up along the Pusan Perimeter and that they must conduct an offensive soon or they could not win the battle . A secondary objective was to surround Taegu and destroy the UN and ROK units in that city . As part of this mission , the North Korean units would first cut the supply lines to Taegu . + On August 20 , the North Korean commands distributed operations orders to their subordinate units . The North Koreans called for a simultaneous five @-@ prong attack against the UN lines . These attacks would overwhelm the UN defenders and allow the North Koreans to break through the lines in at least one place to force the UN forces back . Five battle groupings were ordered .
The center attack called for the NK 9th Division , NK 4th Division , NK 2nd Division , and NK 10th Division to break through the US 2nd Infantry Division at the Naktong Bulge to Miryang and Yongsan . + + = = Battle = = + + During the North Koreans ' September 1 offensive , the US 25th Infantry Division 's US 35th Infantry Regiment was heavily engaged in the Battle of Nam River north of Masan . On the 35th Regiment 's right flank , just north of the confluence of the Nam River and the Naktong River , was the US 9th Infantry Regiment , US 2nd Infantry Division . There , in the southernmost part of the 2nd Infantry Division zone , the 9th Infantry Regiment held a sector more than 20 @,@ 000 yards ( 18 @,@ 000 m ) long , including the bulge area of the Naktong where the First Battle of Naktong Bulge had taken place earlier in August . Each US infantry company on the river line here had a front of 3 @,@ 000 feet ( 910 m ) to 4 @,@ 000 feet ( 1 @,@ 200 m ) and they held only key hills and observation points , as the units were extremely spread out along the wide front . + During the last week of August , US troops on these hills could see minor North Korean activity across the river , which they thought was North Koreans organizing the high ground on the west side of the Naktong against a possible American attack . There were occasional attacks on the 9th Infantry 's forward positions , but to the men in the front lines this appeared to be only a standard patrol action . On August 31 , the UN forces were alerted to a pending attack when much of the Korean civilian labor force fled the front lines . Intelligence officers reported an attack was coming . + On the west side of the Naktong , North Korean Major General Pak Kyo Sam , commanding the NK 9th Division , issued his operations order to the division on August 28 .
Its mission in the forthcoming attack was to outflank and destroy the US troops at Naktong Bulge by capturing the Miryang and Samnangjin areas to cut off the US 2nd Division 's route of supply and withdrawal between Taegu and Pusan . However , the North Koreans weren 't aware that the US 2nd Infantry Division had recently replaced the US 24th Infantry Division in positions along the Naktong River . Consequently , they expected lighter resistance ; the 24th troops were exhausted from months of fighting but the 2nd Division men were fresh and newly arrived in Korea . They had only recently been moved into the line . The North Koreans began crossing the Naktong River under cover of darkness at certain points . + + = = = Battle of Agok = = = + + On the southern @-@ most flank of the 9th Infantry river line , just above the junction of the Nam River with the Naktong , A Company of the 1st Battalion was dug in on a long finger ridge paralleling the Naktong that terminates in Hill 94 at the Kihang ferry site . The river road from Namji @-@ ri running west along the Naktong passes the southern tip of this ridge and crosses to the west side of the river at the ferry . A small village called Agok lay at the base of Hill 94 and 300 yards ( 270 m ) from the river . A patrol of tanks and armored vehicles , together with two infantry squads of A Company , 9th Infantry , held a roadblock near the ferry and close to Agok . On the evening of August 31 , A Company moved from its ridge positions overlooking Agok and the river to new positions along the river below the ridge line . + That evening Sergeant Ernest R. Kouma led the patrol of two M26 Pershing tanks and two M19 Gun Motor Carriages in Agok . Kouma placed his patrol on the west side of Agok near the Kihang ferry . At 20 : 00 a heavy fog covered the river , and at 22 : 00 mortar shells began falling on the American @-@ held side of the river . 
By 22 : 15 this strike intensified and North Korean mortar preparation struck A Company 's positions . American mortars and artillery began firing counterbattery . Some of the A Company men reported they heard noises on the opposite side of the river and splashes in the water . + At 22 : 30 the fog lifted and Kouma saw that a North Korean pontoon bridge was being laid across the river directly in front of his position . Kouma 's four vehicles attacked this structure , and after about a minute of heavy fire the bridge collapsed , and the ponton boats used to hold the bridge in place were sunk . At 23 : 00 a small arms fight flared around the left side of A Company north of the tanks . This gunfire had lasted only two or three minutes when the A Company roadblock squads near the tanks received word by field telephone that the company was withdrawing to the original ridge positions and that they should do likewise . + Kouma 's patrol was then ambushed by a group of North Koreans dressed in US military uniforms . Kouma was wounded and the other three vehicles had to withdraw , but he held the Agok site until 07 : 30 the next morning with his single tank . In the attack against A Company , the North Koreans hit the 1st Platoon , which was near Agok , but they did not find the 2nd Platoon northward . + The NK 9th Division 's infantry crossing of the Naktong and attack on its east side near midnight quickly overran the positions of C Company , north of A Company . There the North Koreans assaulted in force , signaled by green flares and blowing of whistles . The company held its positions only a short time and then attempted to escape . Many of the men moved south , a few of them coming into A Company 's ridge line positions near Agok during the night . Most of C Company moved all the way to the 25th Division positions south of the Naktong . On September 1 that division reported that 110 men of C Company had come into its lines . 
+ + = = = North Korean crossing = = = + + Meanwhile , 5 miles ( 8 @.@ 0 km ) north of Agok and A Company 's position , B Company , 9th Infantry , held a similar position on Hill 209 overlooking the Paekchin ferry crossing of the river . This ferry was located at the middle of the Naktong Bulge where the Yongsan road came down to the Naktong and crossed it . The US 2nd Infantry Division had planned a reconnaissance mission to start from there the night of August 31 , the same night that the NK I Corps offensive rolled across the river . + Near the end of the month two reconnaissance patrols from the 9th Infantry had crossed to the west side of the Naktong and observed North Korean tank and troop activity 2 miles ( 3 @.@ 2 km ) west of the river . Information obtained later indicated it was in fact the command post of the NK 9th Division . On August 25 , 9th Infantry commander Colonel John G. Hill outlined projected " Operation Manchu , " which was to be a company @-@ sized combat patrol to cross the river , advance to the suspected North Korean command post and communications center , destroy it , capture prisoners , and collect intelligence . + The 9th Infantry Regiment had planned Task Force Manchu on orders from the 2nd Division commander Major General Laurence B. Keiser , which in turn had received instructions from Eighth United States Army commander Lieutenant General Walton Walker for aggressive patrolling . Keiser decided the patrol should cross the river at the Paekchin ferry . The 9th Infantry reserve , E Company , reinforced with one section of light machine guns from H Company , was to be the attack force . The 1st Platoon , 2nd Engineer Combat Battalion , was to transport it across the river in assault boats the night of August 31 . Two heavy weapons companies , D and H , were each to furnish one section of heavy machine guns , one section of 81 @-@ mm. mortars , and one section of 75 @-@ mm. recoilless rifles for supporting fires . 
A platoon of 4 @.@ 2 @-@ inch mortars was also to give support . + After dark on August 31 , First Lieutenant Charles I. Caldwell of D Company and First Lieutenant Edward Schmitt of H Company , 9th Infantry , moved their men and weapons to the base of Hill 209 , which was within B Company 's defense sector and overlooked the Paekchin ferry crossing of the Naktong River . The raiding force , E Company , was still in its regimental reserve position about 2 miles ( 3 @.@ 2 km ) west of Yongsan , getting ready with the engineer platoon to move to the crossing site . Colonel Hill went forward in the evening with the 4 @.@ 2 @-@ inch mortar platoon to its position at the base of Hill 209 where the mortarmen prepared to set up their weapons . + By 21 : 00 , the closest front line unit was B Company on top of Hill 209 , 1 mile ( 1 @.@ 6 km ) north of the river road which curved around the hill 's southern base . The regimental chaplain , Captain Lewis B. Sheen , had gone forward in the afternoon to B Company to hold services . On top of Hill 209 , Chaplain Sheen and men in B Company after dark heard splashing in the water below them . They soon discovered a long line of North Korean soldiers wading the river . + The first North Korean crossing at the Paekchin ferry caught the Heavy Mortar Platoon unprepared in the act of setting up its weapons . It also caught most of the D and H Company men at the base of Hill 209 , .5 miles ( 0 @.@ 80 km ) from the crossing site . The North Koreans killed or captured many of the troops there . Hill was there , but escaped to the rear just before midnight , together with several others , when the division canceled Operation Manchu because of the attacks . The first heavy weapons carrying party was on its way up the hill when the North Korean attack engulfed the men below . It hurried on to the top where the advance group waited and there all hastily dug in on a small perimeter . This group was not attacked during the night . 
+ From 21 : 30 until shortly after midnight the NK 9th Division crossed the Naktong at a number of places and climbed the hills quietly toward the 9th Infantry river line positions . Then , when the artillery barrage preparation lifted , the North Korean infantry were in position to launch their assaults . These began in the northern part of the regimental sector and quickly spread southward . At each crossing site the North Koreans would overwhelm local UN defenders before building pontoon bridges for their vehicles and armor . + At 02 : 00 , B Company was attacked . A truck stopped at the bottom of the hill , a whistle sounded , then came a shouted order , and North Korean soldiers started climbing the slope . The hills on both sides of B Company were already under attack as was also Hill 311 , a rugged terrain feature 1 @.@ 5 miles ( 2 @.@ 4 km ) from the river and the North Koreans ' principal immediate objective . The North Koreans apparently were not aware of the Task Force Manchu group lower down on the hill and it was not attacked during the night . But higher up on Hill 209 the North Koreans drove B Company from its position , inflicting very heavy casualties on it . Sheen led one group of soldiers back to friendly lines on 4 September . + At 03 : 00 , 1 September , the 9th Infantry Regiment ordered its only reserve , E Company , to move west along the Yongsan @-@ Naktong River road and take a blocking position at the pass between Cloverleaf Hill and Obong @-@ ni Ridge , 3 miles ( 4 @.@ 8 km ) from the river and 6 miles ( 9 @.@ 7 km ) from Yongsan . This was the critical terrain where so much heavy fighting had taken place in the first battle of the Naktong Bulge . Fighting began at the pass at 02 : 30 when an American medium tank of A Company , 72nd Tank Battalion , knocked out a T @-@ 34 at Tugok , also called Morisil . E Company never reached its blocking position .
A strong North Korean force surprised and delivered heavy automatic fire on it at 03 : 30 from positions astride the road east of the pass . The company suffered heavy casualties , including the company commander and Keiser 's aide who had accompanied the force . With the critical parts of Cloverleaf Hill and Obong @-@ ni Ridge , the best defensive terrain between Yongsan and the river , the North Koreans controlled the high ground . The US 2nd Infantry Division now had to base its defense of Yongsan on relatively poor defensive terrain , the low hills at the western edge of the town . + + = = = US 23rd Infantry attacked = = = + + North of the 9th Infantry sector of the 2nd Infantry Division front along the Naktong , the US 23rd Infantry Regiment on August 29 had just relieved the 3rd Battalion of the US 38th Infantry Regiment , which in turn had only a few days before relieved the US 21st Infantry Regiment of the US 24th Infantry Division . On August 31 , the 23rd Regiment was in a new sector of which it had only a limited knowledge . It took over a 16 @,@ 000 yards ( 15 @,@ 000 m ) Naktong River front without its 3rd Battalion which had been attached to the US 1st Cavalry Division to the north . Colonel Paul L. Freeman , the regimental commander , deployed the 1st Battalion on the high ground along the river with the three companies abreast . The 1st Battalion , under US Lieutenant Colonel Claire E. Hutchin , Jr . , outposted the hills with platoons and squads . He placed the 2nd Battalion in a reserve position 8 miles ( 13 km ) behind the 1st Battalion and in a position where it commanded the road net in the regimental sector . On August 31 the 2nd Division moved E Company south to a reserve position in the 9th Infantry sector . + Two roads ran through the regimental sector from the Naktong River to Changnyong . The main road bent south along the east bank of the river to Pugong @-@ ni and then turned northeast to Changnyong . 
A northern secondary road curved around marshland and lakes , the largest of which was Lake U @-@ p 'o , to Changnyong . In effect , the 1st Battalion of the 23rd Regiment guarded these two approach routes to Changnyong . + The 42 men of the 2nd Platoon , B Company , 23rd Infantry held outpost positions on seven hills covering a 2 @,@ 600 yards ( 2 @,@ 400 m ) front along the east bank of the Naktong north of Pugong @-@ ni . Across the river in the rice paddies they could see , in the afternoon of August 31 , two large groups of North Korean soldiers . Occasionally artillery fire dispersed them . Just before dark , the platoon saw a column of North Koreans come out of the hills and proceed toward the river . They immediately reported to the battalion command post . The artillery forward observer , who estimated the column at 2 @,@ 000 people , thought they were refugees . Freeman immediately ordered the artillery to fire on the column , reducing its number . However the North Koreans continued their advance . + At 21 : 00 came the first shells of what proved to be a two @-@ hour North Korean artillery and mortar preparation against the American river positions of 2nd Platoon . As the barrage rolled on , North Korean infantry crossed the river and climbed the hills in the darkness under cover of its fire . At 23 : 00 the barrage lifted and the North Koreans attacked 2nd Platoon , forcing it from the hill after a short fight . Similar assaults took place elsewhere along the battalion outpost line . + On the regimental left along the main Pugong @-@ ni @-@ Changnyong road North Korean soldiers completely overran C Company by 0300 September 1 . Only seven men of C Company could be accounted for , and three days later , after all the stragglers and those cut off behind North Korean lines had come in , there were only 20 men in the company . 
As the North Korean attack developed during the night , 1st Battalion succeeded in withdrawing a large part of its force , less C Company , just north of Lake U @-@ p 'o and the hills there covering the northern road into Changnyong , 3 miles ( 4 @.@ 8 km ) east of the river and 5 miles ( 8 @.@ 0 km ) west of the town . B Company lost heavily in this action . + When word of the disaster that had overtaken 1st Battalion reached regimental headquarters , Freeman obtained the release of G and F Companies from 2nd Division reserve and sent the former to help 1st Battalion and the latter on the southern road toward Pugong @-@ ni and C Company . Major Lloyd K. Jenson , executive officer of the 2nd Battalion , accompanied F Company down the Pugong @-@ ni road . This force was unable to reach C Company , but Jenson collected stragglers from it and seized high ground astride this main approach to Changnyong near Ponch 'o @-@ ri above Lake Sanorho , and went into a defensive position there . The US 2nd Division released E Company to the regiment and the next day it joined F Company to build up what became the main defensive position of the 23d Regiment in front of Changnyong . North Korean troops during the night passed around the right flank of 1st Battalion 's northern blocking position and reached the road three miles behind him near the division artillery positions . The 23rd Infantry Headquarters and Service Companies and other miscellaneous regimental units finally stopped this penetration near the regimental command post 5 miles ( 8 @.@ 0 km ) northwest of Changnyong . 
+ + = = = US 2nd Division split = = = + + Before the morning of 1 September had passed , reports coming in to US 2nd Division headquarters made it clear that North Koreans had penetrated to the north @-@ south Changnyong @-@ Yongsan road and cut the division in two ; the 38th and 23d Infantry Regiments with the bulk of the division artillery in the north were separated from the division headquarters and the 9th Infantry Regiment in the south . Keiser decided that this situation made it advisable to control and direct the divided division as two special forces . Accordingly , he placed the division artillery commander , Brigadier General Loyal M. Haynes , in command of the northern group . Haynes ' command post was 7 miles ( 11 km ) north of Changnyong . Task Force Haynes became operational at 10 : 20 , September 1 . Southward , in the Yongsan area , Keiser placed Brigadier General Joseph S. Bradley , Assistant Division Commander , in charge of the 9th Infantry Regiment , the 2nd Engineer Combat Battalion , most of the 72nd Tank Battalion , and other miscellaneous units of the division . This southern grouping was known as Task Force Bradley . + All three regiments of the NK 2nd Division @-@ the 4th , 17th , and 6th , in line from north to south @-@ crossed during the night to the east side of the Naktong River into the 23rd Regiment sector . The NK 2nd Division , concentrated in the Sinban @-@ ni area west of the river , had , in effect , attacked straight east across the river and was trying to seize the two avenues of advance into Changnyong above and below Lake U @-@ p 'o . On August 31 , 1950 , Lake U @-@ p 'o was a large body of water although in most places very shallow . + At dawn September 1 , Keiser at 2nd Division headquarters in Muan @-@ ni , 7 miles ( 11 km ) east of Yongsan on the Miryang road , felt his division was in the midst of a crisis . 
The massive North Korean attack had made deep penetrations everywhere in the division sector except in the north in the zone of the 38th Infantry . The NK 9th Division had effected major crossings of the Naktong at two principal points against the US 9th Infantry ; the NK 2nd Division in the meantime had made three major crossings against the US 23rd Infantry ; and the NK 10th Division had begun crossing more troops in the Hill 409 area near Hyongp 'ung in the US 38th Infantry sector . At 08 : 10 Keiser telephoned Eighth Army headquarters and reported the heaviest and deepest North Korean penetrations were in the 9th Infantry sector . + Liaison planes rose from the division strip every hour to observe the North Korean progress and to locate US 2nd Infantry Division front @-@ line units . Communication from division and regimental headquarters to nearly all the forward units was broken . Beginning at 09 : 30 and continuing throughout the rest of the day , the light aviation section of the division artillery located front @-@ line units cut off by the North Koreans , and made fourteen airdrops of ammunition , food , water , and medical supplies . As information slowly built up at division headquarters it became apparent that the North Koreans had punched a hole 6 miles ( 9 @.@ 7 km ) wide and 8 miles ( 13 km ) deep in the middle of the division line and made less severe penetrations elsewhere . The front @-@ line battalions of the US 9th and 23rd Regiments were in various states of disorganization and some companies had virtually disappeared . Keiser hoped he could organize a defense along the Changnyong @-@ Yongsan road east of the Naktong River , and prevent North Korean access to the passes eastward leading to Miryang and Ch 'ongdo . 
+ + = = = Reinforcements = = = + + At 09 : 00 Walker requested the US Air Force to make a maximum effort along the Naktong River from Toksong @-@ dong , just above the US 2nd Division boundary , southward and to a depth of 15 miles ( 24 km ) west of the river . He wanted the Air Force to isolate the battlefield and prevent further North Korean reinforcements and supplies from moving across the river in support of the North Korean spearhead units . The Far East Command requested the US Navy to join in the air effort , and the US Seventh Fleet turned back from its strikes in the Inch 'on @-@ Seoul area and sped southward at full steam toward the southern battle front . Walker came to the US 2nd Division front at 12 : 00 and ordered the division to hold at all costs . He had already ordered ground reinforcements to the Yongsan area . + During the morning of 1 September , Walker weighed the news coming in from his southern front , wavering in a decision as to which part of the front most needed his Pusan Perimeter reserves . Since midnight the NK I Corps had broken his Pusan Perimeter in two places @-@ the NK 2nd and 9th Divisions in the US 2nd Division sector , and the NK 7th Division and NK 6th Division in the US 25th Division sector , below the junction of the Nam and Naktong Rivers . In the US 2nd Division sector North Korean troops were at the edge of Yongsan , the gateway to the corridor leading 12 miles ( 19 km ) eastward to Miryang and the main Pusan @-@ Mukden railroad and highway . 
+ Eighth Army had in reserve three understrength infantry regiments and the 2 @-@ battalion British 27th Infantry Brigade which was not yet completely equipped and ready to be placed in line : The 1st Provisional Marine Brigade at Changwon , 6 miles ( 9 @.@ 7 km ) northeast of Masan , preparing for movement to the port of Pusan ; the US 27th Infantry Regiment of the 25th Division which had arrived at Masan only the night before at 20 : 30 to relieve the 5th Regimental Combat Team , which was then to join the 24th Division in the Taegu area ; and the US 19th Infantry Regiment of the US 24th Infantry Division , then with that division 's headquarters at Kyongsan southeast of Taegu . Walker alerted both the 24th Division headquarters , together with its 19th Regiment , and the 1st Provisional Marine Brigade to move at a moment 's notice ; the 24th Division either to the 2nd or 25th Division fronts , and the marines to an unannounced destination . + As the morning passed , General Walker decided that the situation was most critical in the Naktong Bulge area of the US 2nd Division sector . There the North Koreans threatened Miryang and with it the entire Eighth Army position . At 11 : 00 Walker ordered US Marine Corps Brigadier General Edward A. Craig , commanding the 1st Provisional Marine Brigade , to prepare the marines to move at once . The marines made ready to depart for the Naktong Bulge at 13 : 30 . + + = = = North Korean advance = = = + + The situation on the front was chaotic during the day September 1 . The North Koreans at one place had crossed at the Kihang ferry , captured Agok , and scattered A Company , 9th Infantry at its positions from Agok northward . A Company withdrew to positions on the ridge line back of the river . From there at daylight the men could see North Korean soldiers on many of the ridges surrounding them , most of them moving east . 
After several hours , 2nd Platoon of A Company sent a patrol down the hill to Agok to obtain supplies abandoned there during the night , returning later with much needed water , rations , and ammunition . + Later in the morning North Korean barges crossed the Naktong below A Company . The company sent a squad with a light machine gun to the southern tip of the ridge overlooking Agok to take these troops under fire . When the squad reached the tip of the ridge they saw that a North Korean force occupied houses at its base . The company hit these houses with artillery . The North Koreans broke from the houses , running for the river . At this the light machine gun at the tip of the ridge took them under fire , as did another across the Naktong to the south in the US 25th Infantry Division sector . Proximity fuze artillery fire decimated this group . Combined fire from all weapons inflicted an estimated 300 casualties on this North Korean force . In the afternoon , US aircraft dropped food and ammunition to the company ; only part of it was recovered . The 1st Battalion ordered A Company to withdraw that night . + During the withdrawal , however , A Company ran into a sizable North Korean force and scattered in the ensuing fight . Most of the company , including its commander , were killed at close range . In this desperate action , Private First Class Luther H. Story , a weapons squad leader , fought so tenaciously that he was awarded the Medal of Honor . Badly wounded , Story refused to be a burden to those who might escape , and when last seen was still engaging North Koreans at close range . Of those in the company , approximately ten men escaped to friendly lines . The next morning , under heavy fog , the group made its way by compass toward Yongsan . From a hill at 12 : 00 , after the fog had lifted , the men looked down on the Battle of Yongsan which was then in progress . 
That afternoon 20 survivors of the company merged into the lines of the 72nd Tank Battalion near Yongsan . Stragglers from this position continued to stream in the next few days as well . + + = = = The end of Task Force Manchu = = = + + In the meantime , Task Force Manchu was still holding its position along the Naktong River , about 5 miles ( 8 @.@ 0 km ) north of where A Company had been destroyed on the southern end of the line . The perimeter position taken by the men of D and H Companies , 9th Infantry , who had started up the hill before the North Koreans struck , was on a southern knob of Hill 209 , 0 @.@ 5 miles ( 0 @.@ 80 km ) south of B Company 's higher position . In addition to the D and H Company men , there were a few from the Heavy Mortar Platoon and one or two from B Company . Altogether , 60 to 70 men were in the group . The group had an SCR @-@ 300 radio , a heavy machine gun , two light machine guns , an M1918 Browning Automatic Rifle , about 20 M1 Garand rifles , and about 40 carbines or pistols . Schmitt assumed command of the group . + During the night Schmitt established radio communication with the 1st Battalion , 9th Infantry . When daylight came Schmitt and his group saw that they were surrounded by North Koreans . One force occupied the higher knob half a mile above them , formerly held by B Company . Below them , North Koreans continued crossing the river and moving supplies forward to their combat units , some of them already several miles eastward . The North Koreans quickly discovered the Task Force Manchu group . They first attacked it at 14 : 00 that afternoon , and were repulsed . That night an estimated company attacked three times , pressing the fight to close quarters , but failed each time to penetrate the tight US perimeter . Daylight of the second day disclosed many North Korean dead on the steep slopes outside the perimeter . + In the afternoon of September 2 Schmitt radioed 1st Battalion for an airdrop of supplies . 
A US plane attempted the drop , but the perimeter was so small and the slopes so steep that virtually all the supplies went into North Korean hands . The men in the perimeter did , however , recover from a drop made later at 19 : 00 some supplies and ammunition . Private First Class Joseph R. Ouellette , of H Company , left the perimeter to gather weapons , ammunition , and grenades from the North Korean dead . On several occasions he was attacked , and on one such occasion a North Korean soldier suddenly attacked Ouellette , who killed the North Korean in hand @-@ to @-@ hand combat . + That same afternoon , the North Koreans sent an American prisoner up the hill to Schmitt with the message , " You have one hour to surrender or be blown to pieces . " Failing in frontal infantry attack to reduce the little defending force , the North Koreans now meant to take it under mortar fire . Only 45 minutes later North Korean antitank fire came in on the knob and two machine guns from positions northward and higher on the slope of Hill 209 swept the perimeter . Soon , mortars emplaced on a neighboring high finger ridge eastward registered on Schmitt 's perimeter and continued firing until dark . The machine gun fire forced every man to stay in his foxhole . The lifting of the mortar fire after dark was the signal for renewed North Korean infantry attacks , all of which were repulsed . But the number of killed and wounded within the perimeter was growing , and supplies were diminishing . There were no medical supplies except those carried by one aid man . + The third day , September 3 , the situation worsened . The weather was hot and ammunition , food and supplies were nearly completely exhausted . Since the previous afternoon , North Korean mortar barrages had alternated with infantry assaults against the perimeter . Survivors later estimated there were about twenty separate infantry attacks repulsed . 
Two North Korean machine guns still swept the perimeter whenever anyone showed himself . Dead and dying US troops were in almost every foxhole . Mortar fragments destroyed the radio and this ended all communication with other US units . Artillery fire and air strikes requested by Schmitt never came . Some North Koreans worked their way close to the perimeter and threw grenades into it . Six times Ouellette leaped from his foxhole to escape grenades thrown into it . In this close action Ouellette was killed . Most of the foxholes of the perimeter received one or more direct mortar hits in the course of the continuing mortar fire . One of these killed Schmitt on September 3 . The command passed now to First Lieutenant Raymond J. McDoniel of D Company , senior surviving officer . + At daylight on the morning of 4 September only two officers and approximately half the men who had assembled on the hill , were alive . As the day passed , with ammunition down to about one clip per man and only a few grenades left and no help in sight , McDoniel decided to abandon the position that night . When it got dark the survivors would split into small groups and try to get back to friendly lines . That evening after dark the North Koreans launched another weak attack against the position . At 22 : 00 , McDoniel and Caldwell and 27 enlisted men slipped off the hill in groups of four . Master Sergeant Travis E. Watkins , still alive in his paralyzed condition , refused efforts of evacuation , saying that he did not want to be a burden to those who had a chance to get away . He asked only that his carbine be loaded and placed on his chest with the muzzle under his chin . Like Ouellette , he would also win the Medal of Honor for his actions . Of the 29 men who came off the hill the night of September 4 , 22 escaped to friendly lines , many of them following the Naktong downstream , hiding by day and traveling by night , until they reached the lines of the US 25th Infantry Division . 
+ Members of Task Force Manchu who escaped from Hill 209 brought back considerable intelligence information of North Korean activity in the vicinity of the Paekchin ferry crossing site . At the ferry site the North Koreans had put in an underwater bridge . A short distance downstream , each night they placed a pontoon bridge across the river and took it up before dawn the next morning . Carrying parties of 50 civilians guarded by four North Korean soldiers crossed the river continuously at night , an estimated total of 800 @-@ 1 @,@ 000 carriers being used at this crossing site . + + = = = Changnyong = = = + + North of the US 9th Infantry and the battles in the Naktong Bulge and around Yongsan , the US 23d Infantry Regiment after daylight of September 1 was in a very precarious position . Its 1st Battalion had been driven from the river positions and isolated 3 miles ( 4 @.@ 8 km ) westward . Approximately 400 North Koreans now overran the regimental command post , compelling Freeman to withdraw it about 600 yards ( 550 m ) . There , 5 miles ( 8 @.@ 0 km ) northwest of Changnyong , the US 23rd Infantry Headquarters and Headquarters Company , miscellaneous regimental units , and regimental staff officers checked the North Koreans in a 3 @-@ hour fight . + The North Koreans advanced to Changnyong itself during the afternoon of September 2 , and ROK National Police withdrew from the town . North Koreans were in Changnyong that evening . With his communications broken southward to the 2nd Infantry Division headquarters and the 9th Infantry , Haynes during the day decided to send a tank patrol down the Yongsan road in an effort to re @-@ establish communication . C Company , 72nd Tank Battalion , led its tanks southward . They had to fight their way down the road through several roadblocks . Of the three tanks that started , only the lead tank got through to Yongsan . There , it delivered an overlay of Task Force Haynes ' positions to Bradley . 
+ Still farther northward in the zone of the US 38th Infantry the North Koreans were also active . After the North Korean breakthrough during the night of August 31 , Keiser had ordered the 2nd Battalion , 38th Infantry , to move south and help the 23rd Infantry establish a defensive position west of Changnyong . In attempting to do this , the battalion found North Korean troops already on the ridges along the road . They had penetrated to Hill 284 overlooking the 38th Infantry command post . This hill and Hill 209 dominated the rear areas of the regiment . At 06 : 00 September 3 , 300 North Koreans launched an attack from Hill 284 against the 38th Regiment command post . The regimental commander organized a defensive perimeter and requested a bombing strike which was denied him because the enemy target and his defense perimeter were too close to each other . But the Air Force did deliver rocket and strafing strikes . + This fight continued until September 5 . On that day F Company captured Hill 284 killing 150 North Koreans . From the crest he and his men watched as many more North Koreans ran into a village below them . Directed artillery fire destroyed the village . Among the abandoned North Korean materiel on the hill , Schauer 's men found twenty @-@ five American BARs and submachine guns , a large American radio , thirty boxes of unopened American fragmentation and concussion grenades , and some American rations . + + = = = 1 @-@ 23rd Infantry isolated = = = + + Meanwhile , during these actions in its rear , the 1st Battalion , 23rd Infantry , was cut off 3 miles ( 4 @.@ 8 km ) west of the nearest friendly units . On September 1 the regiment ordered it to withdraw to the Changnyong area . At 14 : 00 a tank @-@ infantry patrol was sent down the road , but it reported that an estimated North Korean battalion held the mountain pass just eastward of the battalion 's defense perimeter . 
Upon receiving this report the battalion commander requested permission by radio to remain in his present position and try to obstruct the movement of North Korean reinforcements and supplies . That evening Freeman approved this request , and 1st Battalion spent three days in the isolated positions . During this time C @-@ 47 Skytrain planes supplied the battalion by airdrops . + On the morning of September 1 , 3rd Battalion , 38th Infantry moved in an attack westward from the 23rd Regiment command post near Mosan @-@ ni to open the road to the 1st Battalion . On the second day of the fighting at the pass , the relief force broke through the roadblock with the help of air strikes and artillery and tank fire . The advanced elements of the battalion joined 1st Battalion at 17 : 00 September 2 . That evening , North Koreans strongly attacked the 3rd Battalion , 38th Infantry , on Hill 209 north of the road and opposite 1st Battalion , driving one company from its position . + On September 4 , Haynes changed the boundary between the 38th and 23rd Infantry Regiments , giving the northern part of the 23rd 's sector to the 38th Infantry , thus releasing 1st Battalion for movement southward to help the 2nd Battalion defend the southern approach to Changnyong . The 1st Battalion , 23rd Infantry , about 1 @,@ 100 men strong when the attack began , was now down to a strength of approximately 600 men . The 23rd Infantry now made plans to concentrate all its troops on the position held by its 2nd Battalion on the Pugong @-@ ni @-@ Changnyong road . The 1st Battalion moved there and took a place on the left flank of the 2nd Battalion . At the same time the regimental command post moved to the rear of this position . In this regimental perimeter , the 23rd Infantry fought a series of hard battles . Simultaneously it had to send combat patrols to its rear to clear infiltrating North Koreans from Changnyong and from its supply road . 
+ + = = = Battle of Yongsan = = = + + On the morning of September 1 the 1st and 2nd Regiments of the NK 9th Division , in their first offensive of the war , stood only a few miles short of Yongsan after a successful river crossing and penetration of the American line . The 3rd Regiment had been left at Inch 'on , but division commander Major General Pak Kyo Sam felt the chances of capturing Yongsan were strong . + On the morning of September 1 , with only the shattered remnants of E Company at hand , the US 9th Infantry Regiment , US 2nd Infantry Division had virtually no troops to defend Yongsan . Keiser in this emergency attached the 2nd Engineer Combat Battalion to the regiment . The US 72nd Tank Battalion and the 2nd Division Reconnaissance Company also were assigned positions close to Yongsan . The regimental commander planned to place the engineers on the chain of low hills that arched around Yongsan on the northwest . + A Company , 2nd Engineer Combat Battalion , moved to the south side of the Yongsan @-@ Naktong River road ; D Company of the 2nd Engineer Battalion was on the north side of the road . Approximately 2 miles ( 3 @.@ 2 km ) west of Yongsan an estimated 300 North Korean troops engaged A Company in a fire fight . M19 Gun Motor Carriages of the 82nd AAA Battalion supported the engineers in this action , which lasted several hours . Meanwhile , with the approval of General Bradley , D Company moved to the hill immediately south of and overlooking Yongsan . A platoon of infantry went into position behind it . A Company was now ordered to fall back to the southeast edge of Yongsan on the left flank of D Company . There , A Company went into position along the road ; on its left was C Company of the Engineer battalion , and beyond C Company was the 2nd Division Reconnaissance Company . The hill occupied by D Company was in reality the western tip of a large mountain mass that lay southeast of the town . 
The road to Miryang came south out of Yongsan , bent around the western tip of this mountain , and then ran eastward along its southern base . In its position , D Company not only commanded the town but also its exit , the road to Miryang . + North Koreans had also approached Yongsan from the south . The US 2nd Division Reconnaissance Company and tanks of the 72nd Tank Battalion opposed them in a sharp fight . In this action , Sergeant First Class Charles W. Turner of the Reconnaissance Company particularly distinguished himself . He mounted a tank , operated its exposed turret machine gun , and directed tank fire which reportedly destroyed seven North Korean machine guns . Turner and this tank came under heavy North Korean fire which shot away the tank 's periscope and antennae and scored more than 50 hits on it . Turner , although wounded , remained on the tank until he was killed . That night North Korean soldiers crossed the low ground around Yongsan and entered the town from the south . + At 09 : 35 September 2 , while the North Koreans were attempting to destroy the engineer troops at the southern edge of Yongsan and clear the road to Miryang , Walker spoke by telephone with Major General Doyle O. Hickey , Deputy Chief of Staff , Far East Command in Tokyo . He described the situation around the Perimeter and said the most serious threat was along the boundary between the US 2nd and US 25th Infantry Divisions . He described the location of his reserve forces and his plans for using them . He said he had started the 1st Provisional Marine Brigade toward Yongsan but had not yet released them for commitment there and he wanted to be sure that General of the Army Douglas MacArthur approved his use of them , since he knew that this would interfere with other plans of the Far East Command . Walker said he did not think he could restore the 2nd Division lines without using them . 
Hickey replied that MacArthur had the day before approved the use of the US Marines if and when Walker considered it necessary . A few hours after this conversation Walker , at 13 : 15 , attached the 1st Provisional Marine Brigade to the US 2nd Division and ordered a co @-@ ordinated attack by all available elements of the division and the marines , with the mission of destroying the North Koreans east of the Naktong River in the 2nd Division sector and of restoring the river line . The marines were to be released from 2nd Division control as soon as this mission was accomplished . + A decision was reached that the marines would attack west at 08 : 00 on September 3 astride the Yongsan @-@ Naktong River road ; the 9th Infantry , B Company of the 72nd Tank Battalion , and D Battery of the 82d AAA Battalion would attack northwest above the marines and attempt to re @-@ establish contact with the US 23rd Infantry ; the 2nd Engineer Combat Battalion , remnants of the 1st Battalion , 9th Infantry , and elements of the 72nd Tank Battalion would attack on the left flank , or south , of the marines to reestablish contact with the 25th Division . Eighth Army now ordered the US 24th Infantry Division headquarters and the US 19th Infantry Regiment to move to the Susan @-@ ni area , 8 miles ( 13 km ) south of Miryang and 15 miles ( 24 km ) east of the confluence of the Nam River and the Naktong River . There it was to prepare to enter the battle in either the 2nd or 25th Division zone . + The American counteroffensive of September 3 – 5 west of Yongsan , according to prisoner statements , resulted in one of the bloodiest debacles of the war for a North Korean division . Even though remnants of the NK 9th Division , supported by the low strength NK 4th Division , still held Obong @-@ ni Ridge , Cloverleaf Hill , and the intervening ground back to the Naktong on September 6 , the division 's offensive strength had been spent at the end of the American counterattack . 
The NK 9th and 4th divisions were not able to resume the offensive . + + = = = NK 2nd Division destroyed = = = + + The NK 2nd Division made a new effort against the 23rd Infantry 's perimeter in the predawn hours of September 8 , in an attempt to break through eastward . This attack , launched at 02 : 30 and heavily supported with artillery , penetrated F Company . It was apparent that unless F Company 's position could be restored the entire regimental front would collapse . When all its officers became casualties , First Lieutenant Ralph R. Robinson , adjutant of the 2nd Battalion , assumed command of the company . With North Koreans rapidly infiltrating his company 's position and gaining its rear , Robinson in the darkness made his way through them 500 yards ( 460 m ) to A Company 's position . There he obtained that company 's reserve platoon and brought it back to F Company . He accomplished the dangerous and difficult task of maneuvering it into the gap in F Company 's lines in darkness and heavy rain . + The attack tapered off with the coming of daylight , but that night it resumed . The North Koreans struck repeatedly at the defense line . This time they continued the fighting into the daylight hours of 9 September . The Air Force then concentrated strong air support over the regimental perimeter to aid the ground troops . Casualties came to the aid stations from the infantry companies in an almost steady stream during the morning . All available men from Headquarters Company and special units were formed into squads and put into the fight at the most critical points . At one time , the regimental reserve was down to six men . When the attack finally ceased shortly after 12 : 00 the 23rd Regiment had an estimated combat efficiency of only 38 percent . + This heavy night and day battle cost the NK 2nd Division most of its remaining offensive strength . 
The medical officer of the NK 17th Regiment , 2nd Division , captured a few days later , said that the division evacuated about 300 men nightly to a hospital in Pugong @-@ ni , and that in the first two weeks of September the 2nd Division lost 1 @,@ 300 killed and 2 @,@ 500 wounded in the fighting west of Changnyong . Even though its offensive strength was largely spent by September 9 , the division continued to harass rear areas around Changnyong with infiltrating groups as large as companies . Patrols daily had to open the main supply road and clear the town . + North Korean and US troops remained locked in combat along the Naktong River for several more days . The North Koreans ' offensive capability was largely destroyed , and the US troops resolved to hold their lines barring further attack . + + = = = North Korean withdrawal = = = + + The UN counterattack at Inchon collapsed the North Korean line and cut off all their main supply and reinforcement routes . On September 19 the UN discovered the North Koreans had abandoned much of the Pusan Perimeter during the night , and the UN units began advancing out of their defensive positions and occupying them . Most of the North Korean units began conducting delaying actions attempting to get as much of their army as possible into North Korea . The North Koreans withdrew from the Masan area first , the night of September 18 – 19 . After the forces there , the remainder of the North Korean armies withdrew rapidly to the North . The US units rapidly pursued them north , passing over the Naktong River positions , which were no longer of strategic importance . + + = = Aftermath = = + + The North Korean 2nd and 9th Divisions were almost completely destroyed in the battles . The 9th Division had numbered 9 @,@ 350 men at the beginning of the offensive on September 1 . The 2nd Division numbered 6 @,@ 000 . Only a few hundred from each division returned to North Korea after the fight . 
The majority of the North Korean troops had been killed , captured or deserted . All of NK II Corps was in a similar state , and the North Korean army , exhausted at Pusan Perimeter and cut off after Inchon , was on the brink of defeat . + By this time , the US 2nd Infantry Division suffered 1 @,@ 120 killed , 2 @,@ 563 wounded , 67 captured and 69 missing during its time at Pusan Perimeter . This included about 180 casualties it suffered during the First Battle of Naktong Bulge the previous month . American forces were continually repulsed but able to prevent the North Koreans from breaking the Pusan Perimeter . The division had numbered 17 @,@ 498 on September 1 , but was in excellent position to attack despite its casualties . The 1st Provisional Marine Brigade suffered 185 killed and around 500 wounded during the Battle of Pusan Perimeter , most of which probably occurred at Yongsan . + Of all the North Korean attacks along the Pusan Perimeter , the Second Battle of Naktong Bulge is seen by historians as the most serious threat . It was the battle in which the North Koreans made the most substantial gains , splitting the US 2nd Infantry Division in half and briefly capturing Yongsan , where they were very close to breaching through to the US forces ' supply lines and threatening other divisions ' rear areas . However , once again the fatal weakness of the North Korean Army had cost it victory after an impressive initial success — its communications and supply were not capable of exploiting a breakthrough and of supporting a continuing attack in the face of massive air , armor , and artillery fire that could be concentrated against its troops at critical points . By September 8 , the North Korean attacks in the area had been repulsed . + + + = Hed PE = + + Hed PE , also known as ( hed ) Planet Earth and stylized as ( həd ) p.e. , is an American rock band from Huntington Beach , California . 
Formed in 1994 , the band performs a style of music which it refers to as " G @-@ punk " , a fusion of punk rock and gangsta rap . + After releasing three albums on Jive Records , Hed PE left the label to record independently , eventually signing with Suburban Noize Records in 2006 . Since 2006 , the band has become known for its involvement in the 9 / 11 Truth movement , referencing it in many of their song lyrics and concerts , as well as the concept of the album New World Orphans . To date , they have released nine studio albums , one live album and two compilation albums . + + = = History = = + + + = = = Formation and major @-@ label debut ( 1994 – 1999 ) = = = + + The band was formed by vocalist Jared Gomes , formerly of The Clue , also known as " M.C.U.D. " ( MC Underdog ) , and guitarist Wes Geer , who became friends amidst the Orange County hardcore punk scene . Gomes and Geer recruited guitarist Chizad , bassist Mawk , drummer B.C. Vaught and DJ Product © 1969 . They named the group " Hed " , which stands for " higher education " . The band built a following with their energetic performances at local venues , and released the self @-@ financed extended play , Church of Realities . Legal issues forced Hed to change their name , adding " PE " , which stood for " Planet Earth " . + Hed PE signed with Jive Records , releasing their self @-@ titled debut album in 1997 . In his review of the album , Allmusic 's Steve Huey wrote " There are some slow and / or unfocused moments [ ... ] but overall , its aggression will probably play well with late- ' 90s metal and punk fans . " Due to the label 's contractual terms and the disappointing sales of the album , the band found themselves unable to repay the cash advances given to them by Jive . Gomes is quoted as saying " We had these romantic visions of the music industry , and we thought it would be cool to be a punk band on a rap label . 
So we fulfilled that dream , but it was also probably the worst thing that could have happened . [ ... ] We 've had offers from Sony and others that we can 't take because we owe Jive so much money . " + + = = = Broke and Blackout ( 2000 – 2004 ) = = = + + On June 6 , 2000 , Hed PE appeared on the tribute album Nativity in Black II , covering Black Sabbath 's " Sabbra Cadabra " . Hed PE released their second studio album , Broke on August 22 , 2000 . It peaked at No. 63 on the Billboard 200 , while its first single , " Bartender " , peaked at No. 23 on the Billboard Mainstream Rock Tracks chart and at No. 27 on the Modern Rock Tracks chart . Allmusic 's Jason D. Taylor wrote : " Broke may have not found as much success in the competitive mainstream market as some would have liked , and even despite its distinct departure from the group 's debut , it is an album that shows more vision than other rap @-@ tinged rock albums to come out in 2000 . " The most negative response to the album came from critics who viewed its lyrics as misogynistic . + On October 27 , 2000 , Gomes was arrested for possession of marijuana while the band was performing in Waterbury , Connecticut . He was released on a US $ 1 @,@ 500 bond . In 2001 , Hed PE performed on the Ozzfest tour alongside bands such as Korn , Static @-@ X , and System of a Down . A music video for " Killing Time " , the second single from Broke , was produced in promotion of the film 3000 Miles to Graceland , which featured the song on its soundtrack . + Hed PE released their third studio album , Blackout , on March 18 , 2003 . It peaked at No. 33 on the Billboard 200 , while its title track peaked at No. 21 on the Mainstream Rock Tracks chart and at No. 32 on the Modern Rock Tracks chart . Allmusic 's Johnny Loftus wrote that " While it expands on melodic elements that had previously played a supporting role in the band 's sound , Blackout also delivers truckloads of crushing guitar and pounding rhythm . 
And whether or not it is the presence of a top @-@ line producer , ( hed ) pe have figured out a way to imbue their aggressive mix of heavy rock and hip @-@ hop with some serious hooks . " Guitarist Jaxon joined the band in early 2004 . He is the fourth person to fill this position . + + = = = Only in Amerika ( 2004 ) = = = + + Hed PE left Jive Records , releasing their fourth studio album , Only in Amerika , on Koch Records on October 19 , 2004 . It peaked at No. 20 on the Top Independent Albums chart and at No. 186 on the Billboard 200 . In his review of the album , Johnny Loftus wrote " It wants to be a confrontational megaphone in the ear of conservatives , but Jahred 's torrential rhetoric is too messy and blatantly offensive to incite anything but superficial anger , and the music -- though occasionally explosive -- takes a backseat to the ranting . " + + = = = Suburban Noize Records and New Album Evolution ( 2006 – 2015 ) = = = + + In 2006 , Hed PE signed with Suburban Noize Records , recording their fifth studio album , Back 2 Base X. The album was intended as a return to the basics of rock music , and did not rely as heavily on studio enhancement as previous releases . The album was released on June 6 , 2006 , the same day as The Best of ( həd ) Planet Earth , a compilation album produced by Jive Records without the band 's authorization or consent . Back 2 Base X peaked at No. 12 on the Independent Albums chart , and at No. 154 on the Billboard 200 . Allmusic 's Rob Theakston wrote that " Back 2 Base X suffers from the same problems as Amerika : it tries to be conceptual in thought à la Tool and vicious in its political commentary à la Fugazi or System of a Down , but somehow falls short by sounding like an angry stoner on a soapbox . It won 't win any new fans , but existing fans of ( hed ) pe 's work won 't be turning their backs away from the band in anger anytime soon , either . 
" + On June 26 , 2007 , the band released their sixth studio album , Insomnia . It peaked at No. 16 on the Independent Albums chart , and at No. 138 on the Billboard 200 . The album 's lead single , " Suffa " , became one of the most requested tracks at Sirius Satellite Radio 's Hard Attack , while the song 's music video was voted one of the Top 10 of 2007 on MTV 's Headbangers Ball . Hed PE released their first live album , The D.I.Y. Guys , in 2008 . On January 13 , 2009 , they released their seventh studio album , New World Orphans . It was released in three different versions ; each contains a different set of bonus tracks . In 2009 , drummer Trauma joined the band . He is the sixth person to fill this position . The band 's eighth studio album , Truth Rising , was released on October 26 , 2010 to mixed reviews . Hed PE played the " Local Heroes Tour " in the fall of 2012 and played with Flipsyde in San Jose on Sunday , October 7 , 2012 . In an interview , frontman Jared Gomes stated that their album for 2013 titled Ascension would be released within the first half of 2014 . Towards the end of 2013 , DJ Product mysteriously left the band with no explanation and no comment from the other members . On January 1 , 2014 , frontman Jared Gomes stated on the band 's official Facebook page that the upcoming ( hed ) PE album would be named " Evolution " and would be released within the year . + On May 13 , 2014 , the band announced on its official Facebook page when its new album , Evolution , would hit stores . The album was set for release on July 22 , 2014 . They also released a teaser of the tone of the new album on their Facebook page and , soon after , the track " One More Body " . + In 2015 , it was confirmed that 12 @-@ year guitarist Jaxon Benge and original bassist Mark Young had left the band . 
They were replaced by guitarist Greg " Gregzilla " Harrison and bassist Kurt Blankenship , leaving vocalist and founding member Jared Gomes as the group 's only remaining original talent . + + = = Style = = + + + = = = Music and lyrics = = = + + Hed PE performs a style of music which they have referred to as " G @-@ punk " , a phrase inspired by the term " G @-@ funk " , itself a reference to the P @-@ Funk collective . Hed PE 's music is a fusion of styles ranging from hip hop , reggae , and ska to hard rock , punk , and heavy metal . Other elements that have been incorporated into this style include blues , funk , jazz and industrial . Jared Gomes ' vocal style ranges from melodic singing to rapping , screaming , and death growls . The band 's lyrics draw from a number of subjects , including the existence of extraterrestrial life , criticism of organized religion , the 9 / 11 Truth movement , cannabis use and sexual intercourse . + Gomes , in addition to the 9 / 11 Truth movement , has expressed support for social liberal politicians such as Nancy Pelosi and president Barack Obama . Previously however , Gomes ' 2004 lyrics for Only in Amerika expressed support for nationalism , and called for retaliation against Al Qaeda for the 9 / 11 terrorist attacks . + + = = = Influences = = = + + The band 's influences include HEM , Beastie Boys , Black Sabbath , Bob Marley , Led Zeppelin , Nine Inch Nails , Snoop Dogg , Cypress Hill , Notorious B.I.G. and Rage Against the Machine . Hed PE 's second album , Broke , incorporated classic rock and world music influences , while Back 2 Base X was influenced by classic punk bands such as the Sex Pistols and the Clash , Insomnia was influenced by thrash metal bands such as Slayer , and New World Orphans was influenced by Suicidal Tendencies and Minor Threat . Guitarist Jaxon has been credited for encouraging a heavier , hardcore punk @-@ influenced musical style . 
+ + = = Band members = = + + Jared ( Paulo Sergio Gomes ) — lead vocals ( 1994 – current ) + Major Trauma ( Jeremiah Stratton ) — drums ( 2008 – current ) + Gregzilla ( Greg Harrison ) — guitar ( 2015 – current ) + Kid Bass ( Kurt Blankenship ) — bass ( 2015 – current ) + + = = = Former members = = = + + Ken Sachs ( The Finger ) — keyboard ( 1994 – 1996 ) + Chad Benekos ( Chizad ) — guitar ( 1994 – 2002 ) + Wesley Geer ( Wesstyle , Wes Geer ) — guitar ( 1994 – 2003 ) + Ben C. Vaught ( B.C. ) — drums ( 1994 – 2003 ) + Doug Boyce ( DJ Product © 1969 ) — turntables , samples ( 1994 – 2013 ) + Mark Young ( Mawk ) — bass ( 1994 – 2015 ) + Sonny Mayo — guitar ( 2002 – 2003 ) + Jackson Benge ( Jaxon ) — guitar ( 2004 – 2015 ) + Christopher Hendrich — drums ( 2004 ) + Mark " Moke " Bistany — drums ( 2004 – 2006 ) + Devin Lebsack — drums ( 2006 – 2007 ) + Anthony " Tiny Bubz " Biuso — drums ( 2007 – 2008 ) + + = = = Timeline = = = + + + = = Discography = = + + Studio albums + Church of Realities ( 1995 ) + Hed PE ( 1997 ) + Broke ( 2000 ) + Blackout ( 2003 ) + Only in Amerika ( 2004 ) + Back 2 Base X ( 2006 ) + Insomnia ( 2007 ) + New World Orphans ( 2009 ) + Truth Rising ( 2010 ) + Evolution ( 2014 ) + Forever ! ( 2016 ) + + + = Ironclad warship = + + An ironclad is a steam @-@ propelled warship protected by iron or steel armor plates used in the early part of the second half of the 19th century . The ironclad was developed as a result of the vulnerability of wooden warships to explosive or incendiary shells . The first ironclad battleship , Gloire , was launched by the French Navy in November 1859 . The British Admiralty had been considering armored warships since 1856 and prepared a draft design for an armored corvette in 1857 ; in early 1859 the Royal Navy started building two iron @-@ hulled armored frigates , and by 1861 had made the decision to move to an all @-@ armored battle fleet . 
After the first clashes of ironclads ( both with wooden ships and with one another ) took place in 1862 during the American Civil War , it became clear that the ironclad had replaced the unarmored ship of the line as the most powerful warship afloat . This type of ship would come to be very successful in the American Civil War . + Ironclads were designed for several roles , including as high seas battleships , coastal defense ships , and long @-@ range cruisers . The rapid evolution of warship design in the late 19th century transformed the ironclad from a wooden @-@ hulled vessel that carried sails to supplement its steam engines into the steel @-@ built , turreted battleships and cruisers familiar in the 20th century . This change was pushed forward by the development of heavier naval guns ( the ironclads of the 1880s carried some of the heaviest guns ever mounted at sea ) , more sophisticated steam engines , and advances in metallurgy which made steel shipbuilding possible . + The quick pace of change meant that many ships were obsolete as soon as they were finished , and that naval tactics were in a state of flux . Many ironclads were built to make use of the ram or the torpedo , which a number of naval designers considered the important weapons of naval combat . There is no clear end to the ironclad period , but towards the end of the 1890s the term ironclad dropped out of use . New ships were increasingly constructed to a standard pattern and designated battleships or armored cruisers . + + = = The ironclad = = + + The ironclad became technically feasible and tactically necessary because of developments in shipbuilding in the first half of the 19th century . According to naval historian J. Richard Hill : " The ( ironclad ) had three chief characteristics : a metal @-@ skinned hull , steam propulsion and a main armament of guns capable of firing explosive shells . 
It is only when all three characteristics are present that a fighting ship can properly be called an ironclad . " Each of these developments was introduced separately in the decade before the first ironclads . + + = = = Steam propulsion = = = + + In the 18th and early 19th centuries fleets had relied on two types of major warship , the ship of the line and the frigate . The first major change to these types was the introduction of steam power for propulsion . While paddle steamer warships had been used from the 1830s onwards , steam propulsion only became suitable for major warships after the adoption of the screw propeller in the 1840s . + Steam @-@ powered screw frigates were built in the mid @-@ 1840s , and at the end of the decade the French Navy introduced steam power to its line of battle . The desire for change came from the ambition of Napoleon III to gain greater influence in Europe , which required a challenge to the British at sea . The first purpose @-@ built steam battleship was the 90 @-@ gun Napoléon in 1850 . Napoléon was armed as a conventional ship @-@ of @-@ the @-@ line , but her steam engines could give her a speed of 12 knots ( 22 km / h ) , regardless of the wind conditions : a potentially decisive advantage in a naval engagement . + The introduction of the steam ship @-@ of @-@ the @-@ line led to a building competition between France and Britain . Eight sister ships to Napoléon were built in France over a period of ten years , but the United Kingdom soon managed to take the lead in production . Altogether , France built ten new wooden steam battleships and converted 28 from older ships of the line , while the United Kingdom built 18 and converted 41 . + + = = = Explosive shells = = = + + The era of the wooden steam ship @-@ of @-@ the @-@ line was brief , because of new , more powerful naval guns . 
In the 1820s and 1830s , warships began to mount increasingly heavy guns , replacing 18- and 24 @-@ pounder guns with 32 @-@ pounders on sailing ships @-@ of @-@ the @-@ line and introducing 68 @-@ pounders on steamers . Then , the first shell guns firing explosive shells were introduced following their development by the French Général Henri @-@ Joseph Paixhans , and by the 1840s were part of the standard armament for naval powers including the French Navy , Royal Navy , Imperial Russian Navy and United States Navy . It is often held that the power of explosive shells to smash wooden hulls , as demonstrated by the Russian destruction of an Ottoman squadron at the Battle of Sinop , spelled the end of the wooden @-@ hulled warship . The more practical threat to wooden ships was from conventional cannon firing red @-@ hot shot , which could lodge in the hull of a wooden ship and cause a fire or ammunition explosion . Some navies even experimented with hollow shot filled with molten metal for extra incendiary power . + + = = = Iron armor = = = + + The use of iron instead of wood as the primary material of ships ' hulls began in the 1830s ; the first " warship " with an iron hull was the gunboat Nemesis , built by Laird for the East India Company in 1839 . There followed , also from Laird , the first full @-@ blown warships with metal hulls , the 1842 steam frigates Guadelupe and Montezuma for the Mexican navy . But a thin iron skin , while not being susceptible to fire or lethal splintering like wood , was not the same thing as providing iron armor calculated to stop enemy gunfire . + Following the demonstration of the power of explosive shells against wooden ships at the Battle of Sinop , and fearing that his own ships would be vulnerable to the Paixhans guns of Russian fortifications in the Crimean War , Emperor Napoleon III ordered the development of light @-@ draft floating batteries , equipped with heavy guns and protected by heavy armor . 
Experiments made during the first half of 1854 proved highly satisfactory , and on 17 July 1854 , the French communicated to the British Government that a solution had been found to make gun @-@ proof vessels and that plans would be communicated . After tests in September 1854 , the British Admiralty agreed to build five armoured floating batteries on the French plans , establishing the important Thames and Millwall Iron Works within the docks . + The French floating batteries were deployed in 1855 as a supplement to the wooden steam battle fleet in the Crimean War . The role of the battery was to assist unarmored mortar boats and gunboats bombarding shore fortifications . The French used three of their ironclad batteries ( Lave , Tonnante and Dévastation ) in 1855 against the defenses at the Battle of Kinburn on the Black Sea , where they were effective against Russian shore defences . They would later be used again during the Italian war in the Adriatic in 1859 . The British floating batteries Glatton and Meteor arrived too late to participate in the action at Kinburn . The British planned to use theirs in the Baltic Sea against the well @-@ fortified naval base at Kronstadt . + The batteries have a claim to the title of the first ironclad warships but they were capable of only 4 knots ( 7 km / h ) under their own power : they operated under their own power at the Battle of Kinburn , but had to be towed for long @-@ range transit . They were also arguably marginal to the work of the navy . The brief success of the floating ironclad batteries convinced France to begin work on armored warships for their battlefleet . + + = = Early ironclad ships and battles = = + + By the end of the 1850s it was clear that France was unable to match British building of steam warships , and to regain the strategic initiative a dramatic change was required . The result was the first ocean @-@ going ironclad , the Gloire , begun in 1857 and launched in 1859 . 
+ Gloire 's wooden hull was modelled on that of a steam ship of the line , reduced to one deck , sheathed in iron plates 4 @.@ 5 inches ( 110 mm ) thick . She was propelled by a steam engine , driving a single screw propeller for a speed of 13 knots ( 24 km / h ) . She was armed with thirty @-@ six 6 @.@ 4 @-@ inch ( 160 mm ) rifled guns . France proceeded to construct 16 ironclad warships , including two more sister ships to Gloire , and the only two @-@ decked broadside ironclads ever built , Magenta and Solférino . + The Royal Navy had not been keen to sacrifice its advantage in steam ships of the line , but was determined that the first British ironclad would outmatch the French ships in every respect , particularly speed . A fast ship would have the advantage of being able to choose a range of engagement which could make her invulnerable to enemy fire . The British specification was more a large , powerful frigate than a ship @-@ of @-@ the @-@ line . The requirement for speed meant a very long vessel , which had to be built from iron . The result was the construction of two Warrior @-@ class ironclads : HMS Warrior and HMS Black Prince . The ships had a successful design , though there were necessarily compromises between ' sea @-@ keeping ' , strategic range and armour protection ; their weapons were more effective than those of Gloire , and with the largest set of steam engines yet fitted to a ship they could steam at 14 @.@ 3 knots ( 26 @.@ 5 km / h ) . Yet the Gloire and her sisters had full iron @-@ armour protection along the waterline and the battery itself . Warrior and Black Prince ( but also the smaller Defence and Resistance ) were obliged to concentrate their armour in a central ' citadel ' or ' armoured box ' , leaving many main deck guns and the fore and aft sections of the vessel unprotected . 
The use of iron in the construction of Warrior also came with some drawbacks ; iron hulls required more regular and intensive repairs than wooden hulls , and iron was more susceptible to fouling by marine life . + By 1862 , navies across Europe had adopted ironclads . Britain and France each had sixteen either completed or under construction , though the British vessels were larger . Austria , Italy , Russia , and Spain were also building ironclads . However , the first battles using the new ironclad ships involved neither Britain nor France , and involved ships markedly different from the broadside @-@ firing , masted designs of Gloire and Warrior . The use of ironclads by both sides in the American Civil War , and the clash of the Italian and Austrian fleets at the Battle of Lissa , had an important influence on the development of ironclad design . + + = = = First battles between ironclads : the U.S. Civil War = = = + + The first use of ironclads in action came in the U.S. Civil War . The U.S. Navy at the time the war broke out had no ironclads , its most powerful ships being six steam @-@ powered unarmoured frigates . Since the bulk of the Navy remained loyal to the Union , the Confederacy sought to gain advantage in the naval conflict by acquiring modern armored ships . In May 1861 , the Confederate Congress voted that $ 2 million be appropriated for the purchase of ironclads from overseas , and in July and August 1861 the Confederacy started work on construction and converting wooden ships . + On 12 October 1861 , the CSS Manassas became the first ironclad to enter combat , when she fought Union warships on the Mississippi during the Battle of the Head of Passes . She had been converted from a commercial vessel in New Orleans for river and coastal fighting . In February 1862 , the larger CSS Virginia joined the Confederate Navy , having been rebuilt at Norfolk . 
Constructed on the hull of USS Merrimack , Virginia originally was a conventional warship made of wood , but she was converted into an iron @-@ covered casemate ironclad gunship , when she entered the Confederate navy . By this time , the Union had completed seven ironclad gunboats of the City class , and was about to complete the USS Monitor , an innovative design proposed by the Swedish inventor John Ericsson . The Union was also building a large armored frigate , the USS New Ironsides , and the smaller USS Galena . + The first battle between ironclads happened on 9 March 1862 , as the armored Monitor was deployed to protect the Union 's wooden fleet from the ironclad ram Virginia and other Confederate warships . In this engagement , the second day of the Battle of Hampton Roads , the two ironclads repeatedly tried to ram one another while shells bounced off their armor . The battle attracted attention worldwide , making it clear that the wooden warship was now out of date , with the ironclads destroying them easily . + The Civil War saw more ironclads built by both sides , and they played an increasing role in the naval war alongside the unarmored warships , commerce raiders and blockade runners . The Union built a large fleet of fifty monitors modeled on their namesake . The Confederacy built ships designed as smaller versions of the Virginia , many of which saw action , but their attempts to buy ironclads overseas were frustrated as European nations confiscated ships being built for the Confederacy — especially in Russia , the only country to openly support the Union through the war . Only CSS Stonewall was completed , and she arrived in American waters just in time for the end of the war . + Through the remainder of the war , ironclads saw action in the Union 's attacks on Confederate ports . 
Seven Union monitors , including USS Montauk , as well as two other ironclads , the ironclad frigate New Ironsides and the light @-@ draft USS Keokuk , participated in the failed attack on Charleston ; one was sunk . Two small ironclads , CSS Palmetto State and CSS Chicora , participated in the defence of the harbor . For the later attack at Mobile Bay , the Union assembled four monitors as well as 11 wooden ships , facing the CSS Tennessee , the Confederacy 's most powerful ironclad , and the gunboats CSS Morgan , CSS Gaines , and CSS Selma . + On the western front , the Union built a formidable force of river ironclads , beginning with several converted riverboats and then contracting engineer James Eads of St. Louis , Missouri , to build the City @-@ class ironclads . These excellent ships were built with twin engines and a central paddle wheel , all protected by an armored casemate . They had a shallow draft , allowing them to journey up smaller tributaries , and were very well suited for river operations . Eads also produced monitors for use on the rivers , the first two of which differed from the ocean @-@ going monitors in that they contained a paddle wheel ( the USS Neosho and USS Osage ) . + Arguably Eads ' vessels were some of the better ironclads of the Western Flotilla , but there were a number of other vessels that served valiantly with the fleet . All were of varying design , some more successful than others , and some were similar to standard riverboats but with armored side @-@ mounted paddle wheels . All were armed with various smoothbore and some rifled guns . If nothing else , the experience of the American Civil War and its wild variety of competing ironclad designs , some more successful ( or disastrous ) than others , confirmed the emerging trade @-@ off or compromises required in applying the latest technological advances in iron armour manufacture , ship construction and gun design — to name a few — also going on in Europe . 
There was no such thing as a ' perfect ' ironclad which could be invincible in every possible encounter ; ship duels , standing up to forts , Brown & Blue @-@ water operations . + The Union ironclads played an important role in the Mississippi and tributaries by providing tremendous fire upon Confederate forts , installations and vessels with relative impunity to enemy fire . They were not as heavily armored as the ocean @-@ going monitors of the Union , but they were adequate for their intended use . More Western Flotilla Union ironclads were sunk by torpedoes ( mines ) than by enemy fire , and the most damaging fire for the Union ironclads was from shore installations , not Confederate vessels . + + = = = Lissa : First fleet battle = = = + + The first fleet battle , and the first ocean battle , involving ironclad warships was the Battle of Lissa in 1866 . Waged between the Austrian and Italian navies , the battle pitted combined fleets of wooden frigates and corvettes and ironclad warships on both sides in the largest naval battle between the battles of Navarino and Tsushima . + The Italian fleet consisted of 12 ironclads and a similar number of wooden warships , escorting transports which carried troops intending to land on the Adriatic island of Lissa . Among the Italian ironclads were seven broadside ironclad frigates , four smaller ironclads , and the newly built Affondatore — a double @-@ turretted ram . Opposing them , the Austrian navy had seven ironclad frigates . + The Austrians believed their ships to have less effective guns than their enemy , so decided to engage the Italians at close range and ram them . The Austrian fleet formed into an arrowhead formation with the ironclads in the first line , charging at the Italian ironclad squadron . 
In the mêlée which followed both sides were frustrated by the lack of damage inflicted by guns , and by the difficulty of ramming — nonetheless , the effective ramming attack made by the Austrian flagship against the Italian Re d 'Italia attracted great attention in following years . + The superior Italian fleet lost two of its ironclads , Re d 'Italia and Palestro , while the Austrian unarmoured screw two @-@ decker SMS Kaiser remarkably survived close actions with four Italian ironclads . The battle ensured the popularity of the ram as a weapon in European ironclads for many years , and the victory won by Austria established it as the predominant naval power in the Adriatic . + The battles of the American Civil War and at Lissa were very influential on the designs and tactics of the ironclad fleets that followed . In particular , they taught a generation of naval officers the misleading lesson that ramming was the best way to sink enemy ironclads . + + = = Armament and tactics = = + + The adoption of iron armor meant that the traditional naval armament of dozens of light cannon became useless , since their shot would bounce off an armored hull . To penetrate armor , increasingly heavy guns were mounted on ships ; nevertheless , the view that ramming was the only way to sink an ironclad became widespread . The increasing size and weight of guns also meant a movement away from the ships mounting many guns broadside , in the manner of a ship @-@ of @-@ the @-@ line , towards a handful of guns in turrets for all @-@ round fire . + + = = = Ram craze = = = + + From the 1860s to the 1880s many naval designers believed that the development of the ironclad meant that the ram was again the most important weapon in naval warfare . With steam power freeing ships from the wind , and armor making them invulnerable to shellfire , the ram seemed to offer the opportunity to strike a decisive blow .
+ The scant damage inflicted by the guns of Monitor and Virginia at the Battle of Hampton Roads and the spectacular but lucky success of the Austrian flagship SMS Erzherzog Ferdinand Max sinking the Italian Re d 'Italia at Lissa gave strength to the ramming craze . From the early 1870s to early 1880s most British naval officers thought that guns were about to be replaced as the main naval armament by the ram . Those who noted the tiny number of ships that had actually been sunk by ramming struggled to be heard . + The revival of ramming had a significant effect on naval tactics . Since the 17th century the predominant tactic of naval warfare had been the line of battle , where a fleet formed a long line to give it the best fire from its broadside guns . This tactic was totally unsuited to ramming , and the ram threw fleet tactics into disarray . The question of how an ironclad fleet should deploy in battle to make best use of the ram was never tested in battle , and if it had been , combat might have shown that rams could only be used against ships which were already stopped dead in the water . + The ram finally fell out of favour in the 1880s , as the same effect could be achieved with a torpedo , with less vulnerability to quick @-@ firing guns . + + = = = Development of naval guns = = = + + The armament of ironclads tended to become concentrated in a small number of powerful guns capable of penetrating the armor of enemy ships at range ; calibre and weight of guns increased markedly to achieve greater penetration . Throughout the ironclad era navies also grappled with the complexities of rifled versus smoothbore guns and breech @-@ loading versus muzzle @-@ loading . + HMS Warrior carried a mixture of 110 @-@ pounder 7 inch ( 180 mm ) breech @-@ loading rifles and more traditional 68 @-@ pounder smoothbore guns .
Warrior highlighted the challenges of picking the right armament ; the breech @-@ loaders she carried , designed by Sir William Armstrong , were intended to be the next generation of heavy armament for the Royal Navy , but were shortly withdrawn from service . + Breech @-@ loading guns seemed to offer important advantages . A breech @-@ loader could be reloaded without moving the gun , a lengthy process particularly if the gun then needed to be re @-@ aimed . The Warrior 's Armstrong guns also had the virtue of being lighter than an equivalent smoothbore and , because of their rifling , more accurate . Nonetheless , the design was rejected because of problems which plagued breech @-@ loaders for decades . + The weakness of the breech @-@ loader was the obvious problem of sealing the breech . All guns are powered by the explosive conversion of gunpowder into gas . This explosion propels the shot or shell out of the front of the gun , but also imposes great stresses on the gun @-@ barrel . If the breech — which experiences some of the greatest forces in the gun — is not entirely secure , then there is a risk that either gas will discharge through the breech or that the breech will break . This in turn reduces the muzzle velocity of the weapon and can also endanger the gun crew . The Warrior 's Armstrong guns suffered from both problems ; the shells were unable to penetrate the 4 @.@ 5 in ( 118 mm ) armor of Gloire , while sometimes the screw which closed the breech flew backwards out of the gun on firing . Similar problems were experienced with the breech @-@ loading guns which became standard in the French and German navies . + These problems influenced the British to equip ships with muzzle @-@ loading weapons of increasing power until the 1880s . 
After a brief introduction of 100 @-@ pounder or 9 @.@ 5 @-@ inch ( 240 mm ) smoothbore Somerset Gun , which weighed 6 @.@ 5 tons ( 6 @.@ 6 t ) , the Admiralty introduced 7 @-@ inch ( 178 mm ) rifled guns , weighing 7 tons . These were followed by a series of increasingly mammoth weapons — guns weighing 12 , 25 , 25 , 38 and finally 81 tons , with calibre increasing from 8 @-@ inch ( 203 mm ) to 16 @-@ inch ( 406 mm ) . + The decision to retain muzzle @-@ loaders until the 1880s has been criticised by historians . However , at least until the late 1870s , the British muzzle @-@ loaders had superior performance in terms of both range and rate of fire than the French and Prussian breech @-@ loaders , which suffered from the same problems as had the first Armstrong guns . + From 1875 onwards , the balance between breech- and muzzle @-@ loading changed . Captain de Bange invented a method of reliably sealing a breech , adopted by the French in 1873 . Just as compellingly , the growing size of naval guns made muzzle @-@ loading much more complicated . With guns of such size there was no prospect of hauling in the gun for re @-@ loading , or even re @-@ loading by hand , and complicated hydraulic systems were required for re @-@ loading the gun outside the turret without exposing the crew to enemy fire . In 1882 , the 81 @-@ ton , 16 @-@ inch ( 406 mm ) guns of HMS Inflexible fired only once every 11 minutes while bombarding Alexandria during the Urabi Revolt . The 100 @-@ ton , 450 mm ( 17 @.@ 72 inch ) guns of Caio Duilio could each fire a round every 15 minutes . + In the Royal Navy , the switch to breech @-@ loaders was finally made in 1879 ; as well as the significant advantages in terms of performance , opinion was swayed by an explosion on board HMS Thunderer caused by a gun being double @-@ loaded , a problem which could only happen with a muzzle @-@ loading gun . + The calibre and weight of guns could only increase so far . 
The larger the gun , the slower it would be to load , the greater the stresses on the ship 's hull , and the less the stability of the ship . The size of the gun peaked in the 1880s , with some of the heaviest calibres of gun ever used at sea . HMS Benbow carried two 16 @.@ 25 @-@ inch ( 413 mm ) breech @-@ loading guns , each weighing 110 tons — no British battleship would ever carry guns as large . The Italian 450 mm ( 17 @.@ 72 inch ) guns would be larger than any gun fitted to a battleship until the 18 @.@ 1 @-@ inch ( 460 mm ) armament of the Japanese Yamato class of World War II . One consideration which became more acute was that even from the original Armstrong models , following the Crimean War , range and hitting power far exceeded simple accuracy , especially at sea where the slightest roll or pitch of the vessel as ' floating weapons @-@ platform ' could negate the advantage of rifling . American ordnance experts accordingly preferred smoothbore monsters whose round shot could at least ' skip ' along the surface of the water . Actual effective combat ranges , they had learned during the Civil War , were comparable to those in the Age of Sail — though a vessel could now be smashed to pieces in only a few rounds . Smoke and the general chaos of battle only added to the problem . As a result , many naval engagements in the ' Age of the Ironclad ' were still fought at ranges within easy eyesight of their targets , and well below the maximum reach of their ships ' guns . + Another method of increasing firepower was to vary the projectile fired or the nature of the propellant . Early ironclads used black powder , which expanded rapidly after combustion ; this meant cannons had relatively short barrels , to prevent the barrel itself slowing the shell . The sharpness of the black powder explosion also meant that guns were subjected to extreme stress . 
One important step was to press the powder into pellets , allowing a slower , more controlled explosion and a longer barrel . A further step forward was the introduction of chemically different brown powder which combusted more slowly again . It also put less stress on the insides of the barrel , allowing guns to last longer and to be manufactured to tighter tolerances . + The development of smokeless powder , based on nitroglycerine or nitrocellulose , by the French inventor Paul Vieille in 1884 was a further step allowing smaller charges of propellant with longer barrels . The guns of the pre @-@ Dreadnought battleships of the 1890s tended to be smaller in calibre compared to the ships of the 1880s , most often 12 in ( 305 mm ) , but progressively grew in length of barrel , making use of improved propellants to gain greater muzzle velocity . + The nature of the projectiles also changed during the ironclad period . Initially , the best armor @-@ piercing projectile was a solid cast @-@ iron shot . Later , shot of chilled iron , a harder iron alloy , gave better armor @-@ piercing qualities . Eventually the armor @-@ piercing shell was developed . + + = = = Positioning of armament = = = + + + = = = = Broadside ironclads = = = = + + The first British , French and Russian ironclads , in a logical development of warship design from the long preceding era of wooden ships of the line , carried their weapons in a single line along their sides and so were called " broadside ironclads . " Both Gloire and HMS Warrior were examples of this type . Because their armor was so heavy , they could only carry a single row of guns along the main deck on each side rather than a row on each deck . + A significant number of broadside ironclads were built in the 1860s , principally in Britain and France , but in smaller numbers by other powers including Italy , Austria , Russia and the United States .
The advantage of mounting guns on both broadsides was that the ship could engage more than one adversary at a time , and the rigging did not impede the field of fire . + Broadside armament also had disadvantages , which became more serious as ironclad technology developed . Heavier guns to penetrate ever @-@ thicker armor meant that fewer guns could be carried . Furthermore , the adoption of ramming as an important tactic meant the need for ahead and all @-@ round fire . These problems led to broadside designs being superseded by designs that gave greater all @-@ round fire , which included central @-@ battery , turret , and barbette designs . + + = = = = Turrets , batteries and barbettes = = = = + + There were two main design alternatives to the broadside . In one design , the guns were placed in an armoured casemate amidships : this arrangement was called the ' box @-@ battery ' or ' centre @-@ battery ' . In the other , the guns could be placed on a rotating platform to give them a broad field of fire ; when fully armored , this arrangement was called a turret and when partially armored or unarmored , a barbette . + The centre @-@ battery was the simpler and , during the 1860s and 1870s , the more popular method . Concentrating guns amidships meant the ship could be shorter and handier than a broadside type . The first full @-@ scale centre @-@ battery ship was HMS Bellerophon of 1865 ; the French laid down centre @-@ battery ironclads in 1865 which were not completed until 1870 . Centre @-@ battery ships often , but not always , had a recessed freeboard enabling some of their guns to fire directly ahead . + The turret was first used in naval combat on the USS Monitor in 1862 , with a type of turret designed by the Swedish engineer John Ericsson . A competing turret design was proposed by the British inventor Cowper Coles with a prototype of this installed on HMS Trusty in 1861 for testing and evaluation purposes .
Ericsson 's turret turned on a central spindle , and Coles 's turned on a ring of bearings . Turrets offered the maximum arc of fire from the guns , but there were significant problems with their use in the 1860s . The fire arc of a turret would be considerably limited by masts and rigging , so they were unsuited to use on the earlier ocean @-@ going ironclads . The second problem was that turrets were extremely heavy . Ericsson was able to offer the heaviest possible turret ( guns and armour protection ) by deliberately designing a ship with very low freeboard . The weight thus saved from having a high broadside above the waterline was diverted to actual guns and armour . Low freeboard , however , also meant a smaller hull and therefore a smaller capacity for coal storage — and therefore range of the vessel . In many respects , the turreted , low @-@ freeboard Monitor and the broadside sailer HMS Warrior represented two opposite extremes in what an ' Ironclad ' was all about . The most dramatic attempt to compromise these two extremes , or ' squaring this circle ' , was designed by Captain Cowper Phipps Coles : HMS Captain , a dangerously low freeboard turret ship which nevertheless carried a full rig of sail , and which subsequently capsized not long after her launch in 1870 . Her half @-@ sister HMS Monarch was restricted to firing from her turrets only on the port and starboard beams . The third Royal Navy ship to combine turrets and masts was HMS Inflexible of 1876 , which carried two turrets on either side of the centre @-@ line , allowing both to fire fore , aft and broadside . + A lighter alternative to the turret , particularly popular with the French navy , was the barbette . These were fixed armored towers which held a gun on a turntable . The crew was sheltered from direct fire , but vulnerable to plunging fire , for instance from shore emplacements . 
The barbette was lighter than the turret , needing less machinery and no roof armor — though nevertheless some barbettes were stripped of their armor plate to reduce the top @-@ weight of their ships . The barbette became widely adopted in the 1880s , and with the addition of an armored ' gun @-@ house ' , transformed into the turrets of the pre @-@ Dreadnought battleships . + + = = = Torpedoes = = = + + The ironclad age saw the development of explosive torpedoes as naval weapons , which helped complicate the design and tactics of ironclad fleets . The first torpedoes were static mines , used extensively in the American Civil War . That conflict also saw the development of the spar torpedo , an explosive charge pushed against the hull of a warship by a small boat . For the first time , a large warship faced a serious threat from a smaller one — and given the relative inefficiency of shellfire against ironclads , the threat from the spar torpedo was taken seriously . The U.S. Navy converted four of its monitors to become turretless armored spar @-@ torpedo vessels while under construction in 1864 – 5 , but these vessels never saw action . Another proposal , the towed or ' Harvey ' torpedo , involved an explosive on a line or outrigger ; either to deter a ship from ramming or to make a torpedo attack by a boat less suicidal . + A more practical and influential weapon was the self @-@ propelled or Whitehead torpedo . Invented in 1868 and deployed in the 1870s , the Whitehead torpedo formed part of the armament of ironclads of the 1880s like HMS Inflexible and the Italian Caio Duilio and Enrico Dandolo . The ironclad 's vulnerability to the torpedo was a key part of the critique of armored warships made by the Jeune Ecole school of naval thought ; it appeared that any ship armored enough to prevent destruction by gunfire would be slow enough to be easily caught by torpedo . 
In practice , however , the Jeune Ecole was only briefly influential and the torpedo formed part of the confusing mixture of weapons possessed by ironclads . + + = = Armor and construction = = + + The first ironclads were built on wooden or iron hulls , and protected by wrought iron armor backed by thick wooden planking . Ironclads were still being built with wooden hulls into the 1870s . + + = = = Hulls : iron , wood and steel = = = + + Using iron construction for warships offered advantages for the engineering of the hull . However , unarmored iron had many military disadvantages , and offered technical problems which kept wooden hulls in use for many years , particularly for long @-@ range cruising warships . + Iron ships had first been proposed for military use in the 1820s . In the 1830s and 1840s , France , Britain and the United States had all experimented with iron @-@ hulled but unarmored gunboats and frigates . However , the iron @-@ hulled frigate was abandoned by the end of the 1840s , because iron hulls were more vulnerable to solid shot ; iron was more brittle than wood , and iron frames more likely to fall out of shape than wood . + The unsuitability of unarmored iron for warship hulls meant that iron was only adopted as a building material for battleships when protected by armor . However , iron gave the naval architect many advantages . Iron allowed larger ships and more flexible design , for instance the use of watertight bulkheads on the lower decks . Warrior , built of iron , was longer and faster than the wooden @-@ hulled Gloire . Iron could be produced to order and used immediately , in contrast to the need to give wood a long period of seasoning . And , given the large quantities of wood required to build a steam warship and the falling cost of iron , iron hulls were increasingly cost @-@ effective . 
The main reason for the French use of wooden hulls for the ironclad fleet built in the 1860s was that the French iron industry could not supply enough , and the main reason why Britain built its handful of wooden @-@ hulled ironclads was to make best use of hulls already started and wood already bought . + Wooden hulls continued to be used for long @-@ range and smaller ironclads , because iron nevertheless had a significant disadvantage . Iron hulls suffered quick fouling by marine life , slowing the ships down — manageable for a European battlefleet close to dry docks , but a difficulty for long @-@ range ships . The only solution was to sheath the iron hull first in wood and then in copper , a laborious and expensive process which made wooden construction remain attractive . Iron and wood were to some extent interchangeable : the Japanese Kongō and Hiei ordered in 1875 were sister @-@ ships , but one was built of iron and the other of composite construction . + After 1872 , steel started to be introduced as a material for construction . Compared to iron , steel allows for greater structural strength for a lower weight . The French Navy led the way with the use of steel in its fleet , starting with the Redoutable , laid down in 1873 and launched in 1876 . Redoutable nonetheless had wrought iron armor plate , and part of her exterior hull was iron rather than steel . + Even though Britain led the world in steel production , the Royal Navy was slow to adopt steel warships . The Bessemer process for steel manufacture produced too many imperfections for large @-@ scale use on ships . French manufacturers used the Siemens @-@ Martin process to produce adequate steel , but British technology lagged behind . The first all @-@ steel warships built by the Royal Navy were the dispatch vessels Iris and Mercury , laid down in 1875 and 1876 . + + = = = Armor and protection schemes = = = + + Iron @-@ built ships used wood as part of their protection scheme . 
HMS Warrior was protected by 4 @.@ 5 in ( 114 mm ) of wrought iron backed by 15 in ( 381 mm ) of teak , the strongest shipbuilding wood . The wood played two roles , preventing spalling and also preventing the shock of a hit damaging the structure of the ship . Later , wood and iron were combined in ' sandwich ' armor , for instance in HMS Inflexible . + Steel was also an obvious material for armor . It was tested in the 1860s , but the steel of the time was too brittle and disintegrated when struck by shells . Steel became practical to use when a way was found to fuse steel onto wrought iron plates , giving a form of compound armor . This compound armor was used by the British in ships built from the late 1870s , first for turret armor ( starting with HMS Inflexible ) and then for all armor ( starting with HMS Colossus of 1882 ) . The French and German navies adopted the innovation almost immediately , with licenses being given for the use of the ' Wilson System ' of producing fused armor . + The first ironclads to have all @-@ steel armor were the Italian Caio Duilio and Enrico Dandolo . Though the ships were laid down in 1873 their armor was not purchased from France until 1877 . The French navy decided in 1880 to adopt compound armor for its fleet , but found it limited in supply , so from 1884 the French navy was using steel armor . Britain stuck to compound armor until 1889 . + The ultimate ironclad armor was case hardened nickel @-@ steel . In 1890 , the U.S. Navy tested steel armor hardened by the Harvey process and found it superior to compound armor . For several years ' Harvey steel ' was the state of the art , produced in the U.S. , France , Germany , Britain , Austria and Italy . In 1894 , the German firm Krupp developed gas cementing , which further hardened steel armor . 
The German Kaiser Friedrich III , laid down in 1895 , was the first ship to benefit from the new ' Krupp armor ' and the new armor was quickly adopted ; the Royal Navy using it from HMS Canopus , laid down in 1896 . By 1901 almost all new battleships used Krupp armor , though the U.S. continued to use Harvey armor alongside until the end of the decade . + The equivalent strengths of the different armor plates were as follows : 15 in ( 381 mm ) of wrought iron was equivalent to 12 in ( 305 mm ) of either plain steel or compound iron and steel armor , and to 7 @.@ 75 in ( 197 mm ) of Harvey armor or 5 @.@ 75 in ( 146 mm ) of Krupp armor . + Ironclad construction also prefigured the later debate in battleship design between tapering and ' all @-@ or @-@ nothing ' armour design . Warrior was only semi @-@ armoured , and could have been disabled by hits on the bow and stern . As the thickness of armor grew to protect ships from the increasingly heavy guns , the area of the ship which could be fully protected diminished . Inflexible 's armor protection was largely limited to the central citadel amidships , protecting boilers and engines , turrets and magazines , and little else . An ingenious arrangement of cork @-@ filled compartments and watertight bulkheads was intended to keep her stable and afloat in the event of heavy damage to her un @-@ armored sections . + + = = Propulsion : steam and sail = = + + The first ocean @-@ going ironclads carried masts and sails like their wooden predecessors , and these features were only gradually abandoned . Early steam engines were inefficient ; the wooden steam fleet of the Royal Navy could only carry " 5 to 9 days coal " , and the situation was similar with the early ironclads .
Warrior also illustrates two design features which aided hybrid propulsion ; she had retractable screws to reduce drag while under sail ( though in practice the steam engine was run at a low throttle ) , and a telescopic funnel which could be folded down to the deck level . + Ships designed for coastal warfare , like the floating batteries of the Crimea , or USS Monitor and her sisters , dispensed with masts from the beginning . The British HMS Devastation , started in 1869 , was the first large , ocean @-@ going ironclad to dispense with masts . Her principal role was for combat in the English Channel and other European waters ; and while her coal supplies gave her enough range to cross the Atlantic , she would have had little endurance on the other side of the ocean . The Devastation and the similar ships commissioned by the British and Russian navies in the 1870s were the exception rather than the rule . Most ironclads of the 1870s retained masts , and only the Italian navy , which during that decade was focused on short @-@ range operations in the Adriatic , built consistently mastless ironclads . + During the 1860s , steam engines improved with the adoption of double @-@ expansion steam engines , which used 30 – 40 % less coal than earlier models . The Royal Navy decided to switch to the double @-@ expansion engine in 1871 , and by 1875 they were widespread . However , this development alone was not enough to herald the end of the mast . Whether this was due to a conservative desire to retain sails , or was a rational response to the operational and strategic situation , is a matter of debate . A steam @-@ only fleet would require a network of coaling stations worldwide , which would need to be fortified at great expense to stop them falling into enemy hands . 
Just as significantly , because of unsolved problems with the technology of the boilers which provided steam for the engines , the performance of double @-@ expansion engines was rarely as good in practice as it was in theory . + During the 1870s the distinction grew between ' first @-@ class ironclads ' or ' battleships ' on the one hand , and ' cruising ironclads ' designed for long @-@ range work on the other . The demands on first @-@ class ironclads for very heavy armor and armament meant increasing displacement , which reduced speed under sail ; and the fashion for turrets and barbettes made a sailing rig increasingly inconvenient . HMS Inflexible , launched in 1876 but not commissioned until 1881 , was the last British battleship to carry masts , and these were widely seen as a mistake . The start of the 1880s saw the end of sailing rig on ironclad battleships . + Sails persisted on ' cruising ironclads ' for much longer . During the 1860s , the French navy had produced the Alma and La Galissonnière classes as small , long @-@ range ironclads to serve as overseas cruisers and the British had responded with ships like HMS Swiftsure of 1870 . The Russian ship General @-@ Admiral , laid down in 1870 and completed in 1875 , was a model of a fast , long @-@ range ironclad which was likely to be able to outrun and outfight ships like Swiftsure . Even the later HMS Shannon , often described as the first British armored cruiser , would have been too slow to outrun General @-@ Admiral . While Shannon was the last British ship with a retractable propeller , later armored cruisers of the 1870s retained sailing rig , sacrificing speed under steam in consequence . It took until 1881 for the Royal Navy to lay down a long @-@ range armored warship capable of catching enemy commerce raiders , HMS Warspite , which was completed in 1888 .
While sailing rigs were obsolescent for all purposes by the end of the 1880s , rigged ships were in service until the early years of the 20th century . + The final evolution of ironclad propulsion was the adoption of the triple @-@ expansion steam engine , a further refinement which was first adopted in HMS Sans Pareil , laid down in 1885 and commissioned in 1891 . Many ships also used a forced draught to get additional power from their engines , and this system was widely used until the introduction of the steam turbine in the mid @-@ 1900s ( decade ) . + + = = Fleets = = + + While ironclads spread rapidly in navies worldwide , there were few pitched naval battles involving ironclads . Most European nations settled differences on land , and the Royal Navy struggled to maintain a deterrent parity with at least France , while providing suitable protection to Britain 's commerce and colonial outposts worldwide . Ironclads remained , for the British Royal Navy , a matter of defending the British Isles first and projecting power abroad second . Those naval engagements of the latter half of the 19th @-@ century which involved ironclads normally involved colonial actions or clashes between second @-@ rate naval powers . But these encounters were often enough to convince British policy @-@ makers of the increasing hazards of strictly naval foreign intervention , from Hampton Roads in the American Civil War to the hardening combined defences of naval arsenals such as Kronstadt and Cherbourg . + There were many types of ironclads : + Seagoing ships intended to " stand in the line of battle " ; the precursors of the battleship . + Coastal service and riverine vessels , including ' floating batteries ' and ' monitors' + Vessels intended for commerce raiding or protection of commerce , called ' armoured cruisers' + + = = = Navies = = = + + The United Kingdom possessed the largest navy in the world for the whole of the ironclad period . 
The Royal Navy was the second to adopt ironclad warships , and it applied them worldwide in their whole range of roles . In the age of sail , the British strategy for war depended on the Royal Navy mounting a blockade of the ports of the enemy . Because of the limited endurance of steamships , this was no longer possible , so the British at times considered the risk @-@ laden plan of engaging an enemy fleet in harbor as soon as war broke out . To this end , the Royal Navy developed a series of ' coast @-@ defence battleships ' , starting with the Devastation class . These ' breastwork monitors ' were markedly different from the other high @-@ seas ironclads of the period and were an important precursor of the modern battleship . As long @-@ range monitors they could reach Bermuda unescorted , for example . However , they were still armed with only four heavy guns and were as vulnerable to mines and obstructions ( and enemy monitors ) as the original monitors of the Union Navy proved to be during the Civil War . The British prepared for an overwhelming mortar bombardment of Kronstadt by the close of the Crimean War , but never considered running the smoke @-@ ridden , shallow @-@ water gauntlet straight to St. Petersburg with ironclads . Likewise , monitors proved acutely unable to ' overwhelm ' enemy fortifications single @-@ handed during the American conflict , though their low @-@ profile and heavy armour protection made them ideal for running gauntlets . Mines and obstructions , however , negated these advantages — a problem the British Admiralty frequently acknowledged but never countered throughout the period . The British never laid down enough Devastation @-@ class ' battleships ' to instantly overwhelm Cherbourg , Kronstadt or even New York City with gunfire . 
Although throughout the 1860s and 1870s the Royal Navy was still in many respects superior to its potential rivals , by the early 1880s widespread concern about the threat from France and Germany culminated in the Naval Defence Act , which promulgated the idea of a ' two @-@ power standard ' , that Britain should possess as many ships as the next two navies combined . This standard provoked aggressive shipbuilding in the 1880s and 1890s . + British ships did not participate in any major wars in the ironclad period . The Royal Navy 's ironclads only saw action as part of colonial battles or one @-@ sided engagements like the bombardment of Alexandria in 1882 . Defending British interests against Ahmed ' Urabi 's Egyptian revolt , a British fleet opened fire on the fortifications around the port of Alexandria . A mixture of centre @-@ battery and turret ships bombarded Egyptian positions for most of a day , forcing the Egyptians to retreat ; return fire from Egyptian guns was heavy at first , but inflicted little damage , killing only five British sailors . Few Egyptian guns were actually dismounted , on the other hand , and the fortifications themselves were typically left intact . Had the Egyptians actually utilised the heavy mortars that were at their disposal , they might have quickly turned the tide , for the attacking British ironclads found it easy ( for accuracy 's sake ) to simply anchor whilst firing — perfect targets for high @-@ angle fire upon their thinly armoured topdecks . + The French navy built the first ironclad to try to gain a strategic advantage over the British , but were consistently out @-@ built by the British . Despite taking the lead with a number of innovations like breech @-@ loading weapons and steel construction , the French navy could never match the size of the Royal Navy . 
In the 1870s , the construction of ironclads ceased for a while in France as the Jeune Ecole school of naval thought took prominence , suggesting that torpedo boats and unarmored cruisers would be the future of warships . Like the British , the French navy saw little action with its ironclads ; the French blockade of Germany in the Franco @-@ Prussian War was ineffective , as the war was settled entirely on land . + Russia built a number of ironclads , generally copies of British or French designs . Nonetheless , there were real innovations from Russia : the first true type of ironclad armored cruiser , the General @-@ Admiral of the 1870s , and a set of remarkably badly designed circular battleships referred to as ' popovkas ' ( for Admiral Popov , who conceived the design ) . The Russian Navy pioneered the wide @-@ scale use of torpedo boats during the Russo @-@ Turkish War of 1877 – 1878 , mainly out of necessity because of the superior numbers and quality of ironclads used by the Turkish navy . Russia expanded her navy in the 1880s and 1890s with modern armored cruisers and battleships , but the ships were manned by inexperienced crews and politically appointed leadership , which contributed to their defeat in the Battle of Tsushima on 27 May 1905 . + The U.S. Navy ended the Civil War with about fifty monitor @-@ type coastal ironclads ; by the 1870s most of these were laid up in reserve , leaving the USA virtually without an ironclad fleet . Another five large monitors were ordered in the 1870s . The limitations of the monitor type effectively prevented the USA from projecting power overseas , and until the 1890s the USA would have come off badly in a conflict with even Spain or the Latin American powers . The 1890s saw the beginning of what became the Great White Fleet , and it was the modern pre @-@ Dreadnoughts and armored cruisers built in the 1890s which defeated the Spanish fleet in the Spanish – American War of 1898 . This started a new era of naval warfare . 
+ Ironclads were widely used in South America . Both sides used ironclads in the Chincha Islands War between Spain and the combined forces of Peru and Chile in the early 1860s . The powerful Spanish Numancia participated in the Battle of Callao but was unable to inflict significant damage to the Callao defences . In addition , Peru was able to deploy two locally built ironclads based on American Civil War designs , the Loa ( a wooden ship converted into a casemate ironclad ) and the Victoria ( a small monitor armed with a single 68 @-@ pdr gun ) , as well as two British @-@ built ironclads : Independencia , a centre @-@ battery ship , and the turret ship Huáscar . Numancia was the first ironclad to circumnavigate the world , arriving in Cádiz on 20 September 1867 , and earning the motto : " Enloricata navis que primo terram circuivit " [ " First ironclad ship to sail around the world " ] . In the War of the Pacific in 1879 , both Peru and Chile had ironclad warships , including some of those used a few years previously against Spain . While the Independencia ran aground early on , the Peruvian ironclad Huáscar made a great impact against Chilean shipping , delaying the Chilean ground invasion by six months . She was eventually caught by two more modern Chilean centre @-@ battery ironclads , the Blanco Encalada and the Almirante Cochrane at the Battle of Angamos Point . + Ironclads were also used from the inception of the Imperial Japanese Navy . The Kōtetsu ( Japanese : 甲鉄 , literally " Ironclad " , later renamed Azuma 東 , " East " ) had a decisive role in the Naval Battle of Hakodate Bay in May 1869 , which marked the end of the Boshin War , and the complete establishment of the Meiji Restoration . The IJN continued to develop its strength and commissioned a number of warships from British and European shipyards , first ironclads and later armored cruisers . 
These ships engaged the Chinese Beiyang fleet which was superior on paper at least at the Battle of the Yalu River . Thanks to superior short @-@ range firepower , the Japanese fleet came off better , sinking or severely damaging eight ships and receiving serious damage to only four . The naval war was concluded the next year at the Battle of Weihaiwei , where the strongest remaining Chinese ships were surrendered to the Japanese . + + = = End of the ironclad warship = = + + There is no clearly defined end to the ironclad , besides the transition from wood hulls to all @-@ metal . Ironclads continued to be used in World War I. Towards the end of the 19th century , the descriptions ' battleship ' and ' armored cruiser ' came to replace the term ' ironclad ' . + The proliferation of ironclad battleship designs came to an end in the 1890s as navies reached a consensus on the design of battleships , producing the type known as the pre @-@ Dreadnought . These ships are sometimes covered in treatments of the ironclad warship . The next evolution of battleship design , the dreadnought , is never referred to as an ' ironclad ' . + Most of the ironclads of the 1870s and 1880s served into the 1900s ( decade ) . For instance , a handful of US navy monitors laid down in the 1870s saw active service in World War I. Pre @-@ Dreadnought battleships and cruisers of the 1890s saw widespread action in World War I and in some cases through to World War II . + + = = = Legacy = = = + + The example of the ironclads had some bearing on the history of the tank , as ironclad warships became an inspiration for ideas of landships and other armored vehicles . H. G. 
Wells , in his short story The Land Ironclads , published in The Strand Magazine in December 1903 , described the use of large , armoured cross @-@ country vehicles , armed with cannon and machine guns , and equipped with pedrail wheels . + + = = Today = = + + A number of ironclads have been preserved or reconstructed as museum ships . + Parts of USS Monitor have been recovered and are being conserved and displayed at the Mariners ' Museum in Newport News , Virginia . + HMS Warrior is today a fully restored museum ship in Portsmouth , England . + Huáscar is berthed at the port of Talcahuano , Chile , on display for visitors . + The City @-@ class ironclad USS Cairo is currently on display in Vicksburg , Mississippi . + Northrop Grumman in Newport News constructed a full @-@ scale replica of USS Monitor . The replica was laid down in February 2005 and completed just two months later . + The Dutch Ramtorenschip ( Coastal ram ) Zr . Ms. Buffel is currently on display in the Maritime Museum Rotterdam . + The Dutch Ramtorenschip ( Coastal ram ) Zr . Ms. Schorpioen is a museum ship at Den Helder . + The complete , recovered wooden hull of the CSS Neuse , a casemate ram ironclad , is on view in Kinston , North Carolina , and , in another part of town on the Neuse River , the recreated ship , named CSS Neuse II , is nearly built and can be visited . + The hull of the casemate ironclad CSS Jackson can be seen in the National Civil War Naval Museum at Port Columbus , Georgia . + The new United States Navy Zumwalt @-@ class guided missile destroyer has been described as bearing resemblance to ironclads . + + + = Little Gidding ( poem ) = + + Little Gidding is the fourth and final poem of T. S. Eliot 's Four Quartets , a series of poems that discuss time , perspective , humanity , and salvation . It was first published in September 1942 after being delayed for over a year because of the air @-@ raids on Great Britain during World War II and Eliot 's declining health . 
The title refers to a small Anglican community in Huntingdonshire , established by Nicholas Ferrar in the 17th century and scattered during the English Civil War . + The poem uses the combined image of fire and Pentecostal fire to emphasise the need for purification and purgation . According to the poet , humanity 's flawed understanding of life and turning away from God leads to a cycle of warfare , but this can be overcome by recognising the lessons of the past . Within the poem , the narrator meets a ghost that is a combination of various poets and literary figures . Little Gidding focuses on the unity of past , present , and future , and claims that understanding this unity is necessary for salvation . + + = = Background = = + + Following the completion of the third Four Quartets poem , The Dry Salvages , Eliot 's health declined and he stayed in Shamley Green , Surrey while he recovered . During this time , Eliot started writing Little Gidding . The first draft was completed in July 1941 but he was dissatisfied with it . He believed the problems with the poem lay with his own inability to write , and that , precipitated by air raids on London , he had started the poem with too little preparation and had written it too quickly . After the first draft was written , he set the poem aside , and he left in September to lecture throughout Great Britain . + After months of not working on the poem , Eliot began to feel compelled to finish it ; it was not until August 1942 , however , that he started working on it again . In total , there were five drafts . The poem was finished by 19 September 1942 and published in the October New English Weekly . Little Gidding was intended to conclude the Four Quartets series , summarising Eliot 's views expressed in this series of poems . + Little Gidding was the home of an Anglican community established in 1626 by Nicholas Ferrar . 
The Ferrar household lived a Christian life according to High Church principles and the Book of Common Prayer . The religious community was dispersed during the English Civil War between Parliamentarians and Royalists but reformed , ending with the death of John Ferrar in 1657 . Eliot had visited the site in May 1936 . + Unlike the other locations mentioned in the titles of the Four Quartets poems , Eliot had no direct connection to the original Christian community . As such , the community is supposed to represent almost any religious community . + + = = Poem = = + + Critics classify Little Gidding as a poem of fire with an emphasis on purgation and the Pentecostal fire . The beginning of the poem discusses time and winter , with attention paid to the arrival of summer . The images of snow , which provoke desires for a spiritual life , transition into an analysis of the four classical elements of fire , earth , air and water and how fire is the primary element of the four . Following this is a discussion on death and destruction , things unaccomplished , and regret for past events . + While using Dante 's terza rima style , the poem continues by describing the Battle of Britain . The image of warfare merges with the depiction of Pentecost , and the Holy Spirit is juxtaposed with the air @-@ raids on London . In the second section , a ghost , representing the poets of the past stuck between worlds , begins talking to the narrator of the poem . The ghost discusses change , art in general , and how humankind is flawed . The only way to overcome the problematic condition of humanity , according to the ghost , is to experience purgation through fire . The fire is described in a manner similar to Julian of Norwich 's writing about God 's love and discussed in relationship to the shirt of Nessus , a shirt that burns its wearer . Little Gidding continues by describing the eternalness of the present and how history exists in a pattern . 
The poem concludes by explaining how sacrifice is needed to allow an individual to die into life and be reborn , and that salvation should be the goal of humankind . + + = = Themes = = + + In terms of renewal , Eliot believed that suffering was needed for all of society before new life could begin . The original Little Gidding community was built for living on monastic lines , but the community was damaged and dispersed by Puritan forces during the English Civil War in 1646 . The church , the centre of the community , was restored in 1714 and again in 1853 . The image of religious renewal is combined with the image of the London air @-@ raids and the constant fighting and destruction within the world . This compound image is used to discuss the connection of holy places with the Holy Spirit , Pentecost , communion with the dead , and the repetition of history . The theme is also internal to Eliot 's own poems ; the image of the rose garden at the end of Little Gidding is the image that begins Burnt Norton and the journey is made circular . Also , the depiction of time within the poem is similar to the way time operates within The Family Reunion . + Like the other poems making up the Four Quartets , Little Gidding deals with the past , present , and future , and humanity 's place within them as each generation is seemingly united . In the second section , there is a ghost who is the compilation of various poets , including Dante , Swift , Yeats , and others . When the ghost joins the poet , the narrator states " Knowing myself yet being someone other " . This suggests that the different times merge at the same time that the different personalities begin to merge , allowing a communication and connection with the dead . Later , in the fourth section , humanity is given a choice between the Holy Spirit and the bombing of London ; redemption or destruction . God 's love allows humankind to be redeemed and escape the living hell through purgation by fire . 
The end of the poem describes how Eliot has attempted to help the world as a poet . He parallels his work in language with working on the soul or working on society . + The ghost , a combination of many literary figures , was originally addressed in the poem as " Ser Brunetto " before being revised as an ambiguous " you " . " Ser Brunetto " was Dante 's way of addressing Brunetto Latini , a former mentor whom he meets in Hell to which he has been condemned for sodomy . Eliot , in a letter to John Hayward dated 27 August 1942 , explained why he changed the wording : + I think you will recognise that it was necessary to get rid of Brunetto for two reasons . The first is that the visionary figure has now become somewhat more definite and will no doubt be identified by some readers with Yeats though I do not mean anything so precise as that . However , I do not wish to take the responsibility of putting Yeats or anybody else into Hell and I do not want to impute to him the particular vice which took Brunetto there . Secondly , although the reference to that Canto is intended to be explicit , I wish the effect of the whole to be Purgatorial which is more appropriate . That brings us to the reference to swimming in fire which you will remember at the end of Purgatorio 26 where the poets are found . + The theme of swimming through flames is connected to the depiction of Guido Guinizelli , a poet that influenced Dante , seeking such a state in Purgatorio XXVI . However , the depiction of swimming was transformed into an image of dancing , an act that appears throughout Yeats 's poetry , within purgatorial flames . The critic Dominic Manganiello suggests that , in combining the image of dancing with purgation , Eliot merges Dante 's and Yeats 's poetic themes . + + = = Reception = = + + Critics such as Malcolm Cowley and Delmore Schwartz describe mixed emotions about the religiosity of the poem . 
Cowley emphasised the mystical nature of the poem and how its themes were closer to Buddhism than Anglicanism while mentioning his appreciation of many of the passages . Schwartz also mentioned the Buddhist images and his admiration for many of the lines in Little Gidding . F. B. Pinion believed that the fourth section of the poem cost " Eliot more trouble and vexation than any passage of the same length he ever wrote , and is his greatest achievement in the Four Quartets . " E. M. Forster did not like Eliot 's emphasis on pain and responded to the poem : " Of course there 's pain on and off through each individual 's life ... You can 't shirk it and so on . But why should it be endorsed by the schoolmaster and sanctified by the priest until the fire and the rose are one when so much of it is caused by disease and bullies ? It is here that Eliot becomes unsatisfactory as a seer . " In 2003 , Roger Scruton wrote that in " Little Gidding " Eliot achieved " that for which he envies Dante — namely , a poetry of belief , in which belief and words are one , and in which the thought cannot be prized free from the controlled and beautiful language " . + + + = The Portage to San Cristobal of A.H. = + + The Portage to San Cristobal of A.H. is a 1981 literary and philosophical novella by George Steiner , in which Jewish Nazi hunters find a fictional Adolf Hitler ( A.H. ) alive in the Amazon jungle thirty years after the end of World War II . The book generated considerable controversy after its publication because in it , Steiner , who is Jewish , allows Hitler to defend himself when he is put on trial in the jungle by his captors . There Hitler maintains that Israel owes its existence to the Holocaust and that he is the " benefactor of the Jews " . + The Portage to San Cristobal of A.H. was a 1983 finalist in the PEN / Faulkner Award for Fiction . 
It was adapted for the theatre by British playwright Christopher Hampton and was staged in London in April 1982 with Alec McCowen playing the part of Adolf Hitler . It was also staged in Hartford , Connecticut in the United States in 1983 and starred John Cullum as Hitler . + + = = Plot summary = = + + From his base in Tel Aviv , Holocaust survivor Emmanuel Lieber directs a group of Jewish Nazi hunters in search of Adolf Hitler . Lieber believes that the former Führer is still alive , and following rumours and hearsay , he tracks Hitler 's movements through South America , until after months of wading through swamps in the Amazon jungle , the search party finds the 90 @-@ year @-@ old alive in a clearing . Lieber flies to San Cristóbal where he awaits the group 's return with their captive . But getting the old man out of the jungle alive is more difficult than getting in , and their progress is further hampered by heavy thunderstorms . + Meanwhile , broken and incoherent radio messages between Lieber and the search party are intercepted by intelligence agents tracking their progress , and rumours begin to spread across the world of Hitler 's capture . Debates flare up over his impending trial , where it will be held and under whose jurisdiction . Orosso is identified as the nearest airfield to the last known location of the search party , and aircraft begin arriving at the hitherto unknown town . But when the search party loses radio contact with Lieber , they must make a decision : do they sit out the storms and deliver their captive to Lieber later , or do they try Hitler in the jungle before their prize is snatched from them by the world at large , who they know will be waiting ? Their decision is the latter , and against Lieber 's advice ( " You must not let him speak ... his tongue is like no other " ) they prepare for a trial with a judge , prosecution and defence attorneys selected from the members of the search party . 
Teku , a local Indian tracker , is asked to observe the trial as an independent witness . + The attention Hitler is receiving , however , renews his strength , and when the trial begins , he brushes aside his " defence attorney " and begins a long speech in four parts in his own defence : + Firstly , Hitler claims he took his doctrines from the Jews and copied the notion of the master race from the Chosen people and their need to separate themselves from the " unclean " . " My racism is a parody of yours , a hungry imitation . " + Hitler justifies the Final Solution by maintaining that the Jews ' God , purer than any other , enslaves its subjects , continually demanding more than they can give and " blackmailing " them with ideals that cannot be attained . The " virus of utopia " had to be stopped . + Hitler states that he was not the originator of evil . " [ Stalin ] had perfected genocide when I was still a nameless scribbler in Munich . " Further , Hitler asserts that the number of lives lost due to his actions are dwarfed by various world atrocities , including those in Russia , China and Africa . + Lastly , Hitler maintains that the Reich begat Israel and suggests that he is the Messiah " whose infamous deeds were allowed by God in order to bring His people home . " He closes by asking , " Should you not honour me who have made ... Zion a reality ? " + At the end of his speech , Teku is the first to react and jumps up shouting " Proven " , only to be drowned out by the appearance of a helicopter over the clearing . + + = = Main characters = = + + Emmanuel Lieber – Jewish Holocaust survivor and director of the search party to find Hitler ; after crawling out of a death pit in Bialka he never took the time to mend and embarked on a life @-@ consuming obsession to bring those responsible for the genocide to justice . 
+ Search party ( all Jewish with family ties to the Holocaust , except for John Asher ) + Simeon – search party leader and " presiding judge " at Hitler 's trial ; he is Lieber 's confidant and torn between leading the party into " unmapped quicksand and green bogs " and turning his back on the " quiet mania of Lieber 's conviction " . + Gideon Benasseraf – falls ill and dies before the trial begins ; during one of his fever @-@ induced ramblings he suggests that Hitler is Jewish ; he had sought out Lieber after being released from a sanatorium and spending three years recuperating in Paris where the care @-@ free living consumed him with guilt . + Elie Barach – Orthodox Jew and " prosecution attorney " at the trial ; he is the moral compass of the group , but his convictions are disturbed by Gideon Benasseraf 's fever @-@ induced assertions that Hitler is Jewish and ends up believing that Hitler may be the second Messiah . + Isaac Amsel – an 18 @-@ year @-@ old boy and witness at the trial ; he is the son of Isaac Amsel senior , former member of the search party killed earlier in a skirmish in São Paulo ; he joined the party to avenge his father 's death . + John Asher – half @-@ Jewish and reluctant " defence attorney " at the trial ; fascinated by the capture of Bormann and the rumours circulating that Hitler may be alive , he had approached Nazi hunter Wiesenthal who directed him to Lieber ; despite being an " outsider " ( no ties to the Holocaust ) Lieber assigned him to the search party because of his military training and his clear @-@ headedness ( " no metaphysical lusts , no cravings for retribution " ) . + Teku – local Indian tracker and independent witness at the trial ; previously the search party 's guide who had abandoned them when they insisted on entering uncharted regions of the jungle , he continued tracking them from a distance before revealing himself . 
+ Adolf Hitler – now 90 years old , the former leader of the Third Reich had not died in the Führerbunker in Berlin , but escaped to South America and hid in the Amazon jungle . + + = = Background and publication = = + + George Steiner , literary critic for The New Yorker and The New York Times , had written about the Holocaust in some of his previous books , including Anno Domini ( 1964 ) , Language and Silence ( 1967 ) and In Bluebeard 's Castle ( 1971 ) . Many of the ideas Steiner expresses in The Portage to San Cristobal of A.H. were reworked from these earlier works . Steiner told New York Times editor D. J. R. Bruckner that this book arose out of his lifelong work on language . " Central to everything I am and believe and have written is my astonishment ... that you can use human speech both to bless , to love , to build , to forgive and also to torture , to hate , to destroy and to annihilate . " + Commenting on the controversy the book generated , Steiner admitted to literary journalist and critic Ron Rosenbaum ( author of Explaining Hitler ) that he too was disturbed by it , adding that his fictional Hitler had gotten the better of him , " golem- or Frankenstein @-@ like " . He said that it felt like the book " wrote me " . Steiner also pointed out that the novella is not only about his thoughts on the Holocaust , but also about the horrific events that took place in countries like Cambodia , Vietnam , El Salvador and Burundi : " My feeling is that one has to grapple with the abyss if one can . " + Steiner wrote The Portage to San Cristobal of A.H. in 1975 and 1976 in Geneva , Switzerland , and the 120 @-@ page work originally appeared in the Spring 1979 issue of the United States literary magazine , The Kenyon Review . It also appeared in the Spring 1980 issue of Granta , the British literary magazine . 
Its first publication in book form , with minor revisions by Steiner , was in May 1981 by Faber and Faber in the United Kingdom — and as requested by Steiner , it was a paperback original . The first United States edition was published in hardcover in April 1982 by Simon & Schuster . + + = = Adaptations = = + + The Portage to San Cristobal of A.H. was adapted for the theatre in 1982 by British playwright Christopher Hampton . It was staged in April 1982 at London 's Mermaid Theatre under the direction of John Dexter with Alec McCowen playing the part of Adolf Hitler . McCowen won the 1982 Evening Standard Theatre Award for best actor for this performance . In 1983 the production moved to the United States where it played at the Hartford Stage Company in Hartford , Connecticut , directed by Mark Lamos and starring John Cullum as Hitler . + This book is the only work of fiction by Steiner to have been adapted for the stage . + + = = Reception = = + + Reaction to The Portage to San Cristobal of A.H. was mixed . Anthony Burgess in The Observer called it " astonishing " , Christopher Booker of The Daily Telegraph described it as a " powerful piece " , and English author A. S. Byatt said it was a " masterpiece " . In Explaining Hitler , Ron Rosenbaum called The Portage " A Frankenstein story " , referring to Steiner 's fictive Hitler as having taken on a life of its own . Writing in Time magazine , Otto Friedrich described the book as " a philosophic fantasy of remarkable intensity " , adding that by not refuting Hitler 's speech , Steiner deviates from the horrors of traditional Holocaust literature and ends the book " on a note of bleak ambiguity " . + Morris Dickstein of The New York Times was more critical of the book , calling it " a misconceived and badly executed novel , a sideshow distraction from the serious business of thinking through the unspeakable horrors of the Nazi era . 
" He described it as " wearisome " that is " suffocate [ d ] " by too much " fine writing " ( belles @-@ lettres ) . He also complained that the characters are lifeless , and while they each have detailed histories , they are only " verbal figments " that do not separate them from one another . Finally Dickstein noted that because almost all the points of Hitler 's speech are drawn from some of Steiner 's earlier works , he " unwittingly creates sympathy for Hitler by making him old and pathetic yet also lucid and brilliant — at once absurdly harmless and unconvincingly dangerous . " + In another review in The New York Times John Leonard wrote that while the book has its strong points , " some wit , a catholic disdain , multiplicity of character and a South American swamp @-@ life that terrifies " , its weaknesses are that " the characters are really ideas , ... the symbols clash and there are too many echoes of better books by Kafka and Proust " . But Leonard 's biggest criticism of the book was Hitler 's speech , which he called " obscene " , and Steiner 's decision to end the book at that point , which Leonard said " not only denies the power of art to arrange and transcend , but ... makes me sick to my stomach . " + Writing in the American literary magazine Salmagundi , Alvin H. Rosenfeld called The Portage a " breakthrough work " that " astonishes " . He was struck by the book 's interplay between the landscape of swamp and jungle , and the " landscape of speech " — the former being " brilliantly registered " with its " immense feeling of physicality " , and the latter , " even more dramatic " in the way it exposes " the dark underside of words " and how its use and misuse reveals the true nature of a person . He was particularly impressed by the depiction of Nazi hunter Emmanuel Lieber and his role as representative of the Jewish consciousness . 
Rosenfeld noted that while Holocaust literature often either soars to " expostulation and apostrophe " , or sinks to " a dwindling sob of elegiac lament " , Steiner 's Lieber " mediates between these two extremes , ... simultaneously records and mourns , coldly enumerates yet carries an immense affect " . What did concern the reviewer , however , was the way Steiner used ideas from his earlier works , that he had put them " virtually verbatim " into Hitler 's mouth , creating the impression that " Steiner 's understanding of Hitler were identical with the latter 's self @-@ understanding " . Rosenfeld also questioned why the book had to end with Hitler 's speech . He said that Steiner 's fictive Hitler plays " the devil 's game of language subversion " , making " madness [ sound ] like music " , something the real Hitler had perfected . By stopping at this point , Rosenfeld felt that Steiner " succumb [ s ] , rhetorically , to the seductive eloquence of negation " , which undermines his own " high standards of moral intelligence " . But overall Rosenfeld said The Portage " must be counted among the most vigorous attempts to portray the presence and meaning of Hitler " , forcing us to confront him " in a way hardly seen before in fiction " . + The Portage to San Cristobal of A.H. was a finalist in the 1983 PEN / Faulkner Award for Fiction . + + = = Controversy = = + + The book generated considerable controversy because of its apparent " admiration for Hitler " . The controversy grew further when the faithful stage adaptation ( " too faithful " , according to Steiner ) was performed in the United Kingdom and the United States . + Hitler 's speech at the end of the book disturbed many readers and critics . Steiner not only lets Hitler justify his past , he allows him the ( almost ) last word before the outside world invades . The fact that Steiner is Jewish made this speech in particular even more contentious . 
One critic , while acknowledging that Steiner always saw Hitler as " the incarnation of unprecedented and unparalleled evil " , felt that there was no clear distinction in the book between Steiner 's own views and those of his fictional Hitler , even going so far as to accuse Steiner , who rejects Jewish nationalism and is a critic of Israel 's treatment of the Palestinians , of anti @-@ Semitism . + In contrast , a Time magazine article at the time felt that Steiner 's intention for the Hitler speech was to use it to explore his previously stated belief " that Hitler wielded language as an almost supernatural force " , drawing attention to Nazi hunter Emmanuel Lieber 's warning from the book regarding Hitler : " There shall come a man who ... will know the grammar of hell and teach it to others . He will know the sounds of madness and loathing and make them seem music . " + Steiner responded to criticism that Hitler 's speech in this book is unchallenged by saying that it had been done before : for example Satan 's speech in Milton 's Paradise Lost ( 1667 ) , and The Grand Inquisitor 's speech in Dostoyevsky 's The Brothers Karamazov ( 1880 ) . He also reminded the reader that Hitler 's speech is balanced out earlier in the book by Lieber 's long monologue on the horrors of the Holocaust . Finally , Steiner said that his Hitler ( A. H. ) is " a fictive figure " , and that it is not he who has the last word , but Teku , the Indian tracker , who shouts " Proven " . Teku is also the Hebrew word used to indicate that " there are issues here beyond our wisdom to answer or decide . " + + + = Temnospondyli = + + Temnospondyli ( from Greek τέμνειν ( temnein , " to cut " ) and σπόνδυλος ( spondylos , " vertebra " ) ) is a diverse subclass of extinct small to giant tetrapods — often considered primitive amphibians — that flourished worldwide during the Carboniferous , Permian , and Triassic periods . A few species continued into the Cretaceous . 
Fossils have been found on every continent . During about 210 million years of evolutionary history , they adapted to a wide range of habitats , including fresh water , terrestrial , and even coastal marine environments . Their life history is well understood , with fossils known from the larval stage , metamorphosis , and maturity . Most temnospondyls were semiaquatic , although some were almost fully terrestrial , returning to the water only to breed . These temnospondyls were some of the first vertebrates fully adapted to life on land . Although temnospondyls are considered amphibians , many had characteristics , such as scales , claws , and armour @-@ like bony plates , that distinguish them from modern amphibians . + Temnospondyls have been known since the early 19th century , and were initially thought to be reptiles . They were described at various times as batrachians , stegocephalians , and labyrinthodonts , although these names are now rarely used . Animals now grouped in Temnospondyli were spread out among several amphibian groups until the early 20th century , when they were found to belong to a distinct taxon based on the structure of their vertebrae . Temnospondyli means " cut vertebrae " , as each vertebra is divided into several parts . + Experts disagree over whether temnospondyls were ancestral to modern amphibians ( frogs , salamanders , and caecilians ) , or whether the whole group died out without leaving any descendants . Different hypotheses have placed modern amphibians as the descendants of temnospondyls , another group of early tetrapods called lepospondyls , or even as descendants of both groups ( with caecilians evolving from lepospondyls and frogs and salamanders evolving from temnospondyls ) . Recent studies place a family of temnospondyls called the amphibamids as the closest relatives of modern amphibians . Similarities in teeth , skulls , and hearing structures link the two groups . 
+ + = = Description = = + + Many temnospondyls are much larger than living amphibians , and superficially resemble crocodiles . Others are smaller and resemble salamanders . Most have broad , flat heads that are either blunt ( brevirostrine ) or elongated ( longirostrine ) . The skulls are rounded or triangular in shape when viewed from above , and are usually covered in pits and ridges . The rugged surfaces of bones may have supported blood vessels , which could transfer carbon dioxide to the bones to neutralize acidic build up in the blood ( early semiaquatic tetrapods would have had difficulty expelling carbon dioxide from their bodies while on land , and these dermal bones may have been an early solution to the problem ) . Many temnospondyls also have canal @-@ like grooves in their skulls called sensory sulci . The sulci , which usually run around the nostrils and eye sockets , are part of a lateral line system used to detect vibrations in water . As semiaquatic animals , most temnospondyls have small limbs with four toes on each front foot and five on each hind foot . Terrestrial temnospondyls have larger , thicker limbs , and some even have claws . One unusual terrestrial temnospondyl , Fayella , has relatively long limbs for its body , and probably lived as an active runner able to chase prey . + Homologues of most of the bones of temnospondyls are also seen in other early tetrapods , aside from a few bones in the skull , such as interfrontals , internasals , and interparietals , that have developed in some temnospondyl taxa . Most temnospondyls have tabular horns in the backs of their skulls , rounded projections of bone separated from the rest of the skull by indentations called otic notches ; in some temnospondyls , such as Zatrachys , they are pointed and very prominent . Among the most distinguishing features of temnospondyls are the interpterygoid vacuities , two large holes in the back of the palate . 
Another pair of holes , choanae , are present in front of these vacuities , and connect the nasal passage with the mouth . Temnospondyls often have teeth on their palates , as well as in their jaws . Some of these teeth are so large , they are referred to as tusks . In some temnospondyls , such as Nigerpeton , tusks in the lower jaw pierce the palate and emerge through openings in the top of the skull . + Very little is known of the soft tissue of temnospondyls . A block of sandstone , described in 2007 from the Early Carboniferous Mauch Chunk Formation of Pennsylvania , included impressions of the bodies of three temnospondyls . These impressions show , when alive , they had smooth skin , robust limbs with webbed feet , and a ridge of skin on their undersides . Trackways referable to small temnospondyls have also been found in Carboniferous and Permian rocks . The trackways , called batrachichni , are usually found in strata deposited around freshwater environments , suggesting the animals had some ties to the water . + Unlike modern amphibians , many temnospondyls are covered in small , closely packed scales . The undersides of most temnospondyls are covered in rows of large ventral plates . During early stages of development , they first have only small , rounded scales . Fossils show , as the animals grew , the scales on the undersides of their bodies developed into large , wide ventral plates . The plates overlap each other in a way that allows a wide range of flexibility . Later semiaquatic temnospondyls , such as trematosaurs and capitosaurs , have no evidence of scales . They may have lost scales to make movement easier under water or to allow cutaneous respiration , the absorption of oxygen through the skin . + Several groups of temnospondyls have large bony plates on their backs . One temnospondyl , Peltobatrachus , has armour @-@ like plating that covers both its back and underside . The temnospondyl Laidleria also has extensive plating on its back . 
Most members of the family Dissorophidae also have armor , although it only covers the midline of the back with two narrow rows of plates . Other temnospondyls , such as Eryops , have been found with small , disc @-@ like bony scutes that were in life probably embedded in the skin . All of these temnospondyls were adapted to a terrestrial lifestyle . Armor may have offered protection from predators in the case of Peltobatrachus . The scutes may have provided stability for the spine , as they would have limited flexibility and may have been connected by strong ligaments . Temnospondyls such as Sclerothorax and Eryops that may have been at least partly terrestrial also have long neural spines on top of their vertebrae that would have stabilized the spine . Bony scutes are also seen in plagiosaurs , but unlike Peltobatrachus , Laidleria , Eryops , and dissorophids , these animals are thought to have been fully aquatic . Plagiosaurs may have inherited their armor from a terrestrial ancestor , as both Peltobatrachus and Laidleria have been considered close relatives of the group . + Temnospondyls ' vertebrae are divided into several segments . In living tetrapods , the main body of the vertebra is a single piece of bone called the centrum , but in temnospondyls , this region was divided into a pleurocentrum and intercentrum . Two types of vertebrae are recognized in temnospondyls : stereospondylous and rhachitomous vertebrae . In rhachitomous vertebrae , the intercentra are large and wedge @-@ shaped , and the pleurocentra are relatively small blocks that fit between them . Both elements support a spine @-@ like neural arch , and well @-@ developed interlocking projections called zygapophyses strengthen the connections between vertebrae . The strong backbone and strong limbs of many rhachitomous temnospondyls allowed them to be partially , and in some cases fully , terrestrial .
In stereospondylous vertebrae , the pleurocentra have been lost entirely , with the intercentra enlarged as the main body of the vertebrae . This weaker type of backbone indicates the stereospondylous temnospondyls spent more time in water . + + = = History of study = = + + Temnospondyli was named by German palaeontologist Karl Alfred von Zittel in his second edition of Handbuch der Palaeontologie , published in 1888 . Temnospondyl remains were known since the early part of the 19th century , however . The earliest described temnospondyl was Mastodonsaurus , named by Georg Friedrich Jaeger in 1828 . Jaeger named Mastodonsaurus from a single tooth , and considered it a reptile . Mastodonsaurus means " breast tooth lizard " after the nipple @-@ like shape of the tip of the tooth . + The naming of these first specimens was disputed , however . Leopold Fitzinger named the animal Batrachosaurus in 1837 . In 1841 , English palaeontologist Richard Owen referred to the genus as Labyrinthodon to describe its highly folded or labyrinthine teeth . Owen thought the name Mastodonsaurus " ought not to be retained , because it recalls unavoidably the idea of the mammalian genus Mastodon , or else a mammilloid form of the tooth ... and because the second element of the word , saurus , indicates a false affinity , the remains belonging , not to the Saurian , but to the Batrachian order of Reptiles . " Owen recognized the animal was not a " saurian " reptile , yet he also referred Jaeger 's Phytosaurus to the genus . Although the two genera both have similarly sized conical teeth , Phytosaurus was later found to be a crocodile @-@ like reptile . Additional material , including skulls , firmly placed Labyrinthodon as an amphibian . Jaeger also named Salamandroides giganteus in 1828 , basing it on partial occiput , or back portion of the skull . In 1833 , he described a complete skull of S. 
giganteus that had the same teeth as his Mastodonsaurus , making it the first known complete skull of a temnospondyl . Because Mastodonsaurus was named first , it has precedence over the other names as a senior subjective synonym . Batrachosaurus is still used as the name of an unrelated brachyopid temnospondyl . + Mastodonsaurus and other similar animals were referred to as labyrinthodonts , named like Labyrinthodon for teeth that were highly folded in cross section . Owen 's " Labyrinthodon Jaegeri " was later found at Guy 's Cliffe , England by paleontologist William Buckland . Other specimens were found in the red sandstone of Warwickshire . As more fossils were uncovered in England , Owen depicted these labyrinthodonts as the " highest " form of batrachian and compared them to crocodiles , which he considered the highest form of reptiles . He also noted the large labyrinthodonts of the Keuper ( a unit of rocks that dates to the Late Triassic ) were younger than more advanced reptiles in the Magnesian and Zechstein , which are Late Permian in age . Owen used these fossils to counter the notion that reptiles evolved from a sequential progression from early amphibians ( what he called " metamorphosed fishes " ) . + In addition to Mastodonsaurus , some of the earliest named genera included Metopias and Rhombopholis in 1842 , Zygosaurus in 1848 , Trematosaurus in 1849 , Baphetes and Dendrerpeton in 1853 , Capitosaurus in 1858 , and Dasyceps in 1859 . Baphetes is now placed as an early tetrapod outside Temnospondyli , and Rhombopholis is now considered a prolacertiform reptile . + Later in the 19th century , temnospondyls were classified as various members of Stegocephalia , a name coined by American paleontologist Edward Drinker Cope in 1868 . Cope placed stegocephalians in the class Batrachia , the name then used for Amphibia . Stegocephalia means " roof @-@ headed " in Greek , a reference to the wide , flat heads of temnospondyls and other early tetrapods . 
During this time , palaeontologists considered temnospondyls to be amphibians because they possessed three main features : gill arches in juvenile skeletons , indicating they were amphibious for at least the first part of their lives ; ribs that do not connect at the underside of the rib cage ; and deep pits in the skull that were interpreted as space for mucous glands . + Several suborders of stegocephalians were recognized in the late 19th and early 20th centuries . Animals now regarded as temnospondyls were primarily labyrinthodonts , but some were classified in the Branchiosauria . Branchiosaurs were small @-@ bodied and had simple conical teeth , while labyrinthodonts were larger and had complex , folded dentin and enamel in their teeth . Branchiosauria included only a few forms , such as Branchiosaurus from Europe and Amphibamus from North America , that had poorly developed bones , external gills , and no ribs . Some skeletons of Amphibamus were later found with long ribs , prompting its reassignment to Microsauria ( although more detailed studies found it to be a temnospondyl ) . Soft tissue , such as scales and external gills , was found in many well @-@ preserved branchiosaur fossils from Germany . In the early 20th century , branchiosaurs would be recognized as larval forms of temnospondyls lacking many of the typical features that define the group , and Branchiosauria is no longer recognized as a distinct group . + Other animals that would later be classified as temnospondyls were placed in a group called Ganocephala , characterized by plate @-@ like skull bones , small limbs , fish @-@ like scales , and branchial arches . Unlike labyrinthodonts , they did not have parietal foramina , small holes in their skulls behind their eye sockets . Archegosaurus , Dendrerpeton , Eryops and Trimerorhachis were placed in this group and were considered to be the most primitive members of Reptilia .
Their rhachitomous vertebrae , notochord , and lack of occipital condyles ( which attached the head to the neck ) were features that were also shared with fishes . Thus , they were considered a link between early fishes and more advanced forms such as stegocephalians . + Another group called Microsauria was named by Cope in 1868 . Cope classified Microsauria as a subgroup of Labyrinthodontia , placing many small , amphibian @-@ like animals within it . Among them were Dendrerpeton , once placed in Ganocephala . Dendrerpeton was later placed as a labyrinthodont with other temnospondyls , but confusion existed for many years over the classification of small amphibians . + By the end of the 19th century , most of what are today regarded as temnospondyls were placed in the suborder Labyrinthodonta . American paleontologist Ermine Cowles Case called it Labyrinthodonta vera or " true labyrinthodonts " . The names Stegocephalia and Labyrinthodontia were used interchangeably to refer to the order in which it belonged . The labyrinthodontian suborders Microsauria and Branchiosauria , both of which contain temnospondyls , were distinct from Labyrinthodonta . Within Labyrinthodonta were the groups Rhachitomi , Labyrinthodonti , and Embolerimi . Members of Rhachitomi , such as Archegosaurus and Eryops , had rhachitomous vertebrae with enlarged intercentra that displaced the pleurocentra . Labyrinthodonti , such as Mastodonsaurus , Trematosaurus , and Micropholis , had lost their pleurocentra , and the intercentra made up the entire body of the vertebrae . Embolerimi had intercentra and pleurocentra that were of equal size . Embolomeres are now identified as reptiliomorphs distantly related to temnospondyls . + In 1888 , von Zittel divided stegocephalians among three taxa : Lepospondyli , Temnospondyli , and Stereospondyli . He placed microsaurs in Lepospondyli , a group which he characterized as having simple , spool @-@ shaped vertebral centra . 
Temnospondyli included forms with the centra divided into pleurocentra and intercentra . All members of Stereospondyli had amphicoelous centra composed only of the intercentra . Cope objected to von Zittel 's classification , considering the vertebrae of lepospondyls and stereospondyls indistinguishable because each had a simple spool shape . He continued to use Ganocephala and Labyrinthodonta ( which he alternatively referred to as Rhachitomi ) to distinguish animals based on the absence or presence of occipital condyles . + Temnospondyli became a commonly used name at the turn of the century . Paleontologists included both embolomeres and rhachitomes in the group . Cope 's Ganocephala and Labyrinthodonta fell out of use . In 1919 , British paleontologist D. M. S. Watson proposed that the evolutionary history of these large amphibians could be seen through changes in their vertebrae . Embolomerous forms in the Carboniferous graded into rhachitomous forms in the Permian , and finally into stereospondyls in the Triassic . More importantly , Watson began using the term Labyrinthodontia to refer to these groups . The name Temnospondyli was rarely used in the decades that followed . Swedish paleontologist Gunnar Säve @-@ Söderbergh removed embolomeres from the group , narrowing its scope to rhachitomes and stereospondyls . His classification of labyrinthodonts was based heavily on characteristics of the skull rather than the vertebrae . + American paleontologist Alfred Romer brought the name Temnospondyli back into use in the later 20th century . Säve @-@ Söderbergh used the name Labyrinthodontia in a strict sense ( sensu stricto ) to refer to Rhachitomi and Stereospondyli , excluding Embolomeri . Romer agreed with this classification , but used the name Temnospondyli to avoid confusion with Labyrinthodontia in its wider sense ( sensu lato ) . Unlike modern temnospondyl classification , however , Romer included the primitive Ichthyostegalia in the group . 
+ + = = Evolutionary history = = + + + = = = Carboniferous and Early Permian = = = + + Temnospondyls first appeared in the Early Carboniferous around 330 million years ago ( Mya ) . During the Carboniferous , temnospondyls included basal medium @-@ sized forms such as Dendrerpeton or large semiaquatic forms such as Cochleosaurus . Other , more derived temnospondyls , such as the amphibamids , were smaller and more terrestrial . They resembled salamanders , and some taxa , such as the genus Branchiosaurus , even retained external gills like the modern @-@ day axolotl . During the latest Carboniferous and Early Permian around 300 Mya , several groups , such as the dissorophids and trematopids , evolved strong , robust limbs and vertebrae and became adapted to life on land while others , such as the eryopids , developed into large semiaquatic predators . The dvinosaurs , a group of small aquatic temnospondyls , evolved from terrestrial ancestors in the Late Carboniferous . + + = = = Late Permian = = = + + During the Late Permian , increasing aridity and the diversification of reptiles contributed to a decline in terrestrial temnospondyls , but semiaquatic and fully aquatic temnospondyls continued to flourish , including the large Melosaurus of Eastern Europe . Other temnospondyls , such as archegosaurids , developed long snouts and a close similarity to crocodiles , although they lacked the armor characteristic of the latter group . These temnospondyls included the largest known amphibian , the 9 @-@ m @-@ long Prionosuchus of Brazil . + + = = = Mesozoic = = = + + As temnospondyls continued to flourish and diversify in the Late Permian ( 260 @.@ 4 - 251 @.@ 0 Mya ) , a major group called Stereospondyli became more dependent on life in the water . The vertebrae became weak , the limbs small , and the skull large and flat , with the eyes facing upwards .
During the Triassic period , these animals dominated the freshwater ecosystems , evolving in a range of both small and large forms . During the Early Triassic ( 251 @.@ 0 - 245 @.@ 0 Mya ) one group of successful long @-@ snouted fish @-@ eaters , the trematosauroids , even adapted to a life in the sea , the only known amphibians to do so with the exception of the modern crab @-@ eating frog . Another group , the capitosauroids , included medium- and large @-@ sized animals 2 @.@ 3 to 4 m ( 7 @.@ 5 to 13 @.@ 1 ft ) in length , with large and flat skulls that could be over a meter long in the largest forms such as Mastodonsaurus . These animals spent most or all their lives in water as aquatic predators , catching their prey by a sudden opening of the upper jaw and sucking in fish or other small animals . + In the Carnian stage of the Late Triassic ( 228 @.@ 0 - 216 @.@ 5 Mya ) , capitosauroids were joined by the superficially very similar Metoposauridae . Metoposaurids are distinguished from capitosauroids by the positioning of their eye sockets near the front of their skulls . Another group of stereospondyls , the plagiosaurs , had wide heads with external gills , and adapted to life at the bottom of lakes and rivers . By this time , temnospondyls had become a common and widespread component of semiaquatic ecosystems . Some temnospondyls , such as Cryobatrachus and Kryostega , even inhabited Antarctica , which was covered in temperate forests at the time . + Triassic temnospondyls were often the dominant semiaquatic animals in their environments . Large assemblages of metoposaurs with hundreds of individuals preserved together have been found in the southwestern United States . They have often been interpreted as mass death events caused by droughts in floodplain environments . Recent studies show these dense assemblages were instead probably the result of currents accumulating dead individuals in certain areas . 
These environments seem to have had little diversity , as they were inhabited almost exclusively by metoposaurs . + The Triassic @-@ Jurassic extinction event around 199 @.@ 6 Mya led to the extinction of most Mesozoic temnospondyls . The brachyopoids survived , as well as a few capitosauroids and trematosauroids . While the latter two groups soon became extinct , brachyopoids persisted and grew to large sizes during the Jurassic . Among brachyopoids , the brachyopids flourished in China and the chigutisaurids became common in Gondwana . The most recent known temnospondyl was the giant chigutisaurid Koolasuchus , known from the Early Cretaceous of Australia . It survived in rift valleys that were too cold in the winter for crocodiles that normally would have competed with it . Koolasuchus was one of the largest of the brachyopoids , with an estimated weight of 500 kg ( 1 @,@ 100 lb ) . + + = = Classification = = + + Originally , temnospondyls were classified according to the structure of their vertebrae . Early forms , with complex vertebrae consisting of a number of separate elements , were placed in the suborder Rachitomi , and large Triassic aquatic forms with simpler vertebrae were placed in the suborder Stereospondyli . With the recent growth of phylogenetics , this classification is no longer viable . The basic rhachitomous condition is found in many primitive tetrapods , and is not unique to one group of temnospondyls . Moreover , the distinction between rhachitomous and stereospondylous vertebrae is not entirely clear . Some temnospondyls have rhachitomous , semirhachitomous , and stereospondylous vertebrae at different points in the same vertebral column . Other taxa have intermediate morphologies that do not fit into any category . Rachitomi is no longer recognized as a group , but Stereospondyli is still considered valid .
Below is a simplified taxonomy of temnospondyls showing currently recognized groups : + Class Amphibia + Order Temnospondyli + Superfamily Edopoidea + Family Cochleosauridae ( Chenoprosopidae ) + Family Edopidae + Family Dendrerpetontidae + Suborder Euskelia + Superfamily Dissorophoidea + Family Amphibamidae + Family Branchiosauridae + Family Dissorophidae + Family Micromelerpetontidae + Superfamily Eryopoidea + Family Eryopidae + Family Parioxyidae + Family Zatrachydidae + Clade Limnarchia + Clade Stereospondylomorpha + Superfamily Archegosauroidea + Family Actinodontidae + Family Archegosauridae + Family Intasuchidae ( placement is uncertain ) + Family Sclerocephalidae + Suborder Stereospondyli + Family Peltobatrachidae + Family Lapillopsidae + Family Rhinesuchidae + Family Lydekkerinidae + Clade Capitosauria + Superfamily Mastodonsauroidea ( Capitosauroidea ) + Family Heylerosauridae + Family Mastodonsauridae + Family Stenotosauridae + Infraorder Trematosauria + Superfamily Trematosauroidea + Superfamily Metoposauroidea + Superfamily Plagiosauroidea + Superfamily Brachyopoidea + Superfamily Rhytidosteoidea + + = = = Phylogeny = = = + + In one of the earliest phylogenetic analyses of the group , Gardiner ( 1983 ) recognized five characteristics that made Temnospondyli a clade : a bone at the back of the skull , the parasphenoid , is connected to another bone on the underside of the skull , the pterygoid ; large openings called interpterygoid vacuities are present between the pterygoids ; the stapes ( a bone involved in hearing ) is connected to the parasphenoid and projects upward ; the cleithrum , a bone in the pectoral girdle , is thin ; and part of the vertebra called the interdorsal attaches to the neural arch . Additional features were given by Godfrey et al . 
( 1987 ) , including the contact between the postparietal and exoccipital at the back of the skull , small projections ( uncinate processes ) on the ribs , and a pelvic girdle with each side having a single iliac blade . These shared characteristics are called synapomorphies . + Temnospondyls are placed as basal tetrapods in phylogenetic analyses , with their exact positioning varying between studies . Depending on the classification of modern amphibians , they are either included in the crown group Tetrapoda or the stem of Tetrapoda . Crown @-@ group tetrapods are descendants of the most recent common ancestor of all living tetrapods and stem tetrapods are forms that are outside the crown group . Modern amphibians have recently been suggested as descendants of temnospondyls , which would place them within crown Tetrapoda . Below is a cladogram from Ruta et al . ( 2003 ) placing Temnospondyli within crown Tetrapoda : + Other studies place modern amphibians as the descendants of lepospondyls and place temnospondyls in a more basal position within the stem of Tetrapoda . Below is a cladogram from Laurin and Reisz ( 1999 ) placing Temnospondyli outside crown Tetrapoda : + Most phylogenetic analyses of temnospondyl interrelationships focus on individual families . One of the first broad @-@ scale studies of temnospondyl phylogeny was conducted by paleontologist Andrew Milner in 1990 . A 2007 study made a " supertree " of all temnospondyl families , combining the family @-@ level trees of previous studies . The following cladogram is modified from Ruta et al . ( 2007 ) : + 1 Temnospondyli , 2 Edopoidea , 3 Dvinosauria , 4 Euskelia , 5 Eryopoidea , 6 Dissorophoidea , 7 Limnarchia , 8 Archegosauroidea , 9 Stereospondyli , 10 Rhytidostea , 11 Brachyopoidea , 12 Capitosauria , 13 Trematosauria , 14 Metoposauroidea + The most basal group of temnospondyls is the superfamily Edopoidea . 
Edopoids have several primitive or plesiomorphic features , including a single occipital condyle and a bone called the intertemporal that is absent in other temnospondyls . Edopoids include the Late Carboniferous genus Edops and the family Cochleosauridae . Dendrerpetontidae has also been included in Edopoidea , and is the oldest known temnospondyl family . Balanerpeton woodi is the oldest species , having been present over 330 million years ago during the Viséan stage of the Early Carboniferous . Recent analyses place Dendrerpetontidae outside Edopoidea in a more derived position . Other primitive temnospondyls include Capetus and Iberospondylus . Saharastega and Nigerpeton , both described in 2005 from Niger , are also primitive yet come from the Late Permian . They are almost 40 million years younger than other basal temnospondyls , implying a long ghost lineage of species that are not yet known in the fossil record . + In 2000 , paleontologists Adam Yates and Anne Warren produced a revised phylogeny of more derived temnospondyls , naming several new clades . Two major clades were Euskelia and Limnarchia . Euskelia includes the temnospondyls that were once called rhachitomes and includes two superfamilies , the Dissorophoidea and the Eryopoidea . Dissorophoids include small , mostly terrestrial temnospondyls that may be the ancestors of modern amphibians . Eryopoids include larger temnospondyls like Eryops . The second major clade , Limnarchia , includes most Mesozoic temnospondyls , as well as some Permian groups . Within Limnarchia are the superfamily Archegosauroidea and the most derived temnospondyls , the stereospondyls . + Yates and Warren also named Dvinosauria , a clade of small aquatic temnospondyls from the Carboniferous , Permian , and Triassic . They placed Dvinosauria within Limnarchia , but more recent studies disagree on their position .
For example , a 2007 study places them even more basal than euskelians , while a 2008 study keeps them as basal limnarchians . + Within Stereospondyli , Yates and Warren erected two major clades : Capitosauria and Trematosauria . Capitosaurs include large semiaquatic temnospondyls like Mastodonsaurus with flat heads and eyes near the back of the skull . Trematosaurs include a diversity of temnospondyls , including large marine trematosauroids , aquatic plagiosaurs , brachyopoids that survived into the Cretaceous , and metoposauroids with eyes near the front of their heads . In 2000 , paleontologists Rainer Schoch and Andrew Milner named a third major clade of stereospondyls , the Rhytidostea . This group included more primitive stereospondyls that could not be placed in either Capitosauria or Trematosauria , and included groups like Lydekkerinidae , Rhytidosteidae , and Brachyopoidea . While Capitosauria and Trematosauria are still widely used , Rhytidostea is not often supported as a true clade in recent analyses . Rhytidosteids and brachyopoids are now grouped with trematosaurians , but lydekkerinids are still considered to be a primitive family of stereospondyls . + A new phylogeny of temnospondyls was offered by paleontologist Rainer Schoch in 2013 . It supported many of the clades that were found by Yates and Warren , but it did not find support for their division of derived stereospondyls into Euskelia and Limnarchia . Eryopids were found to be more closely related to stereospondyls than to dissorophoids , which were grouped with dvinosaurs . The clade including Eryopidae and Stereospondylomorpha was named Eryopiformes . In addition , Schoch named the clade containing all temnospondyls except edopoids Eutemnospondyli and reinstated the name Rhachitomi for the clade containing all temnospondyls except edopoids and dendrerpetontids . 
Below is the cladogram from Schoch 's analysis : + + = = = Relationship to modern amphibians = = = + + Modern amphibians ( frogs , salamanders , and caecilians ) are classified in Lissamphibia . Lissamphibians appear to have arisen in the Permian . Molecular clock estimates place the first lissamphibian in the Late Carboniferous , but the first member of Batrachia ( frogs and salamanders , but not caecilians ) is estimated to have appeared in the Middle Permian using the same technique . Using fossil evidence , there are three main theories for the origin of modern amphibians . + One is that they evolved from dissorophoid temnospondyls . Another is that they evolved from lepospondyls , most likely the lysorophians . A third hypothesis is that caecilians descended from lepospondyls and frogs and salamanders evolved from dissorophoids . + Recently , the theory that temnospondyls were the ancestors of all lissamphibians has gained wide support . The skull morphology of some small temnospondyls has been compared to those of modern frogs and salamanders , but the presence of bicuspid , pedicellate teeth in small , paedomorphic or immature temnospondyls has been cited as the most convincing argument in favor of the temnospondyl origin of lissamphibians . Seen in lissamphibians and many dissorophoid temnospondyls , pedicellate teeth have calcified tips and bases . During the development of most tetrapods , teeth begin to calcify at their tips . Calcification normally proceeds downward to the base of the tooth , but calcification from the tip stops abruptly in pedicellate teeth . Calcification resumes at the base , leaving an area in the center of the tooth uncalcified . This pattern is seen in living amphibians and fossils . + The dissorophoid family Amphibamidae is thought to be most closely related to Lissamphibia . 
In 2008 , an amphibamid called Gerobatrachus hottoni was named from Texas and was nicknamed the " frogamander " for its frog @-@ like head and salamander @-@ like body . It was thought to be the most closely related temnospondyl to lissamphibians and was placed as the sister taxon of the group in a phylogenetic analysis . Another species of amphibamid called Doleserpeton annectens is now thought to be even more closely related to lissamphibians . Unlike Gerobatrachus , Doleserpeton has been known since 1969 , and the presence of pedicellate teeth in its jaws led some paleontologists to conclude soon after its naming that it was a relative of modern amphibians . It was first described as a " protolissamphibian " , and the specific name annectens means " connecting " in reference to its inferred transitional position between temnospondyls and lissamphibians . The structure of its tympanum , a disk @-@ like membrane that functions like an ear drum , is similar to that of frogs and has also been used as evidence for a close relationship . Other features including the shape of the palate and the back of the skull , the short ribs , and the smooth skull surface also point to it being a closer relative of lissamphibians than is Gerobatrachus . Below is a cladogram modified from Sigurdsen and Bolt ( 2010 ) showing the relationships of Gerobatrachus , Doleserpeton , and Lissamphibia : + + = = Paleobiology = = + + + = = = Feeding = = = + + Although the earliest temnospondyls were primarily semiaquatic , they had the ability to feed on land . Later , eryopoids and dissorophoids , some well adapted to terrestrial life , also fed on land . Some eryopoids became better adapted toward life in water , and shifted their diets toward aquatic organisms . The first primarily aquatic feeders were archegosaurs in the Permian . Trematosaurs and capitosaurs became independently aquatic and also returned to this type of feeding . + Most aquatic stereospondyls have flattened heads . 
When feeding , they probably opened their mouths by lifting their skulls instead of lowering their lower jaws . The jaw mechanics of the plagiosaurid Gerrothorax is well known , and is one of the most highly adapted . Gerrothorax is thought to have lifted its skull to around 50 ° above horizontal through the flexing of the atlanto @-@ occipital joint between the occipital condyles of the skull and the atlas vertebra of the neck . As the skull is raised , the quadrate bone pushes forward and causes the lower jaw to protrude outward . Other stereospondyls probably also lifted their skulls , but they are not as well adapted for such movement . D.M.S. Watson was the first to suggest skull lifting as a means of feeding in temnospondyls . He envisioned that Mastodonsaurus , a much larger temnospondyl than Gerrothorax , was able to make the same movement . Paleontologist A.L. Panchen also supported the idea in 1959 , suggesting that Batrachosuchus also fed in this way . At the time it was thought that these temnospondyls lifted their heads with strong jaw muscles , but it is now thought that they used larger muscles in the neck that were attached to the large pectoral girdle . Plagiosuchus , a close relative of Gerrothorax , also has a hyobranchial skeleton that muscles may have attached to . Plagiosuchus has very small teeth and a large area for muscle attachment behind the skull , suggesting that it could suction feed by rapidly opening its mouth . + Unlike semiaquatic temnospondyls , terrestrial temnospondyls have skulls that are adapted for biting land @-@ living prey . The sutures between the bones of the skull in the dissorophoid Phonerpeton are able to withstand a high degree of compression . Compressive forces would have been experienced when biting down on prey . Earlier aquatic tetrapods and tetrapod ancestors differ from temnospondyls like Phonerpeton in that their skulls were also built to withstand tension . 
This tension would have been experienced during suction feeding underwater . Temnospondyls like Phonerpeton were among the first tetrapods that were almost exclusively terrestrial and fed by biting . + + = = = Reproduction = = = + + Temnospondyls , like all amphibians , reproduced in aquatic environments . Most temnospondyls probably reproduced through external fertilization . Like most living frogs , female temnospondyls would have laid masses of eggs in water while males released sperm to fertilize them . Several fossils were described from the Early Permian of Texas in 1998 that may be egg masses of dissorophoid temnospondyls . They were the first known fossils of amphibian eggs . The fossils consist of small disks with thin membranes that are probably vitelline membranes and halo @-@ like areas surrounding them that are most likely mucous coatings . They are attached to plant fossils , suggesting that these temnospondyls laid eggs on aquatic plants much like modern frogs . The mucous coatings show that the eggs were laid by amphibians , not fish ( their eggs lack mucus ) , but the type of amphibian that laid them cannot be known because no body fossils are preserved with the eggs . The eggs are thought to be from dissorophoids because they are likely to be close relatives of modern amphibians , and probably had similar reproductive strategies . They are also the most common amphibians from the deposit in which the eggs were found . + One temnospondyl , the dvinosaur Trimerorhachis , may have brooded young in an area between the gills called the pharyngeal pouch . Small bones belonging to younger Trimerorhachis individuals have been found in these pouches . The living Darwin 's Frog is also a mouth brooder and would be the closest modern analogue to Trimerorhachis if it cared for its young in this way . An alternative possibility is that Trimerorhachis was cannibalistic , eating its young like many amphibians do today . 
If this was the case , the bones of these smaller individuals were originally located in the throat and were pushed into the pharyngeal pouch as the animal fossilized . + Body impressions of Early Carboniferous temnospondyls from Pennsylvania suggest that some terrestrial temnospondyls mated on land like some modern amphibians . They reproduced through internal fertilization rather than mating in water . The presence of three individuals in one block of sandstone shows that the temnospondyls were gregarious . The head of one individual rests under the tail of another in what may be a courtship display . Internal fertilization and similar courtship behavior are seen in modern salamanders . + + = = = Growth = = = + + While most types of temnospondyls are distinguished on the basis of features in mature specimens , several are known from juvenile and larval specimens . Metamorphosis is seen in dissorophoids , eryopids , and zatrachydids , with aquatic larvae developing into adults capable of living on land . Several types of dissorophoids do not fully metamorphose , but retain features of juveniles such as gills and small body size in what is known as neoteny . Dvinosaurians and the plagiosaurid Gerrothorax were also neotenic because they retained gills , but they are only known from adult specimens . + Temnospondyl larvae are often distinguished by poorly developed bones and the presence of a hyobranchial apparatus , a series of bones that gills would attach to in life . However , some fully mature temnospondyls also possess hyobranchial bones but did not have external gills . A dense covering of scales is also seen in larvae and adults . Major body changes occur in metamorphosis , including the reshaping and strengthening of skull bones , the thickening of postcranial bones , and an increase in body size . + Temnospondyls like Sclerocephalus are known from both large adult specimens and small larvae , showing an extreme change in body shape . 
In these species , the shape and proportions of skull bones change in the early stages of development . The ornamentation on the surface of the skull roof also develops at this time . Small , regularly spaced pits are the first to form , followed by larger ridges . As development continues , the external gills disappear . Small teeth that once covered the palate are lost . The postcranial skeleton does not develop at the same rate as the skull , with ossification ( the replacement of cartilage by bone ) happening more slowly . Vertebrae and limb bones are poorly developed , ribs and fingers are absent in the early stages , and the scapulocoracoid and ischium are entirely absent through most of development . Once maturity is reached , most bones have fully formed and growth rate slows . The bones of some temnospondyls like Dutuitosaurus show growth marks , possibly an indication that growth rate varied with the change in seasons . Fossils of temnospondyls like Metoposaurus and Cheliderpeton show that individuals grew larger past maturity . The oldest individuals usually have more pitting on their skulls with deeper sulci . + One group of temnospondyls , the Branchiosauridae , is also known from larval specimens . Branchiosaurids like Branchiosaurus and Apateon are represented by many fossils preserving skin and external gills . An entire growth series is exhibited in the wide range of sizes among specimens , but the lack of terrestrially adapted adult forms suggests that these temnospondyls were neotenic . Unlike other temnospondyls , their postcranial skeletons developed quickly but were still partly cartilaginous when fully mature . Adults likely had an aquatic lifestyle similar to juveniles . Recently , large specimens of Apateon gracilis were described with adaptations toward a terrestrial lifestyle , indicating that not all branchiosaurs were neotenic . 
+ While most temnospondyls are aquatic in early stages of life , most metoposaurids appear to have been terrestrial in their juvenile stage . Like other Mesozoic temnospondyls , adult metoposaurids were adapted to a semiaquatic lifestyle . Their bones are not highly developed for movement on land . The cross @-@ sectional thickness of limb bones in adult metoposaurids shows that they could not withstand the stress of terrestrial locomotion . Juvenile individuals have bones that are thick enough to withstand this stress , and could probably move about on land . To maintain a terrestrial lifestyle , a temnospondyl 's limb bones would have to thicken with positive allometry , meaning that they would grow at a greater rate than the rest of the body . This is not the case in metoposaurids , meaning that as their bodies grew larger they became less adapted toward a terrestrial lifestyle . + + = = = Hearing = = = + + Temnospondyls and other early tetrapods have rounded otic notches in the back of the skull that project into the cheek region . In life , the otic notch would have been covered by a membrane called the tympanum , which is seen as a disk @-@ like area in living frogs . The tympanum is involved in hearing , and is similar to the ear drum of more advanced tetrapods . It was traditionally thought that the tympanum developed very early in tetrapod evolution as a hearing organ and progressed to form the ear drum of amniotes . Thus , temnospondyls possessed a hearing system supposedly ancestral to that of living amphibians and reptiles . + Frogs and all other living tetrapods have a rod @-@ like bone called the stapes that aids in hearing by transferring vibrations from the ear drum — or homologous tympanum — to the inner ear . Temnospondyls also have a stapes , which projects into the otic cavity . The stapes likely evolved from the hyomandibula of lobe @-@ finned fishes . 
The positioning of the stapes and the shape of the otic region suggests that the tympani of temnospondyls and frogs are homologous , but the tympani of these amphibians are no longer considered homologous with the hearing systems of reptiles , birds , and mammals . Therefore , ear structures in temnospondyls were not ancestral to those of all other tetrapods . + The ability of the tympanum and stapes to effectively transmit vibrations is called impedance matching . Early tetrapods like temnospondyls have thick stapes with poor impedance matching , so it is now thought that they were not used for hearing . Instead , these thick stapes may have functioned to support the tissue that covers the otic notch . Early temnospondyls like Dendrerpeton could not hear airborne sound but would have been able to detect vibration in the ground . Later temnospondyls like Doleserpeton had otic regions adapted to hearing . Doleserpeton has a structure in the inner ear called the perilymphatic duct , which is also seen in frogs and is associated with hearing . Its stapes is also a better transmitter of sound . The hearing system of Doleserpeton and related temnospondyls was able to detect airborne sound and may have been ancestral to that of living amphibians . + + + = Osbert de Bayeux = + + Osbert de Bayeux ( floruit 1121 to 1184 ) was a medieval English cleric and archdeacon in the Diocese of York . A relative of Thurstan , the Archbishop of York , Osbert probably owed his ecclesiastical positions to this relative . After Thurstan 's death , Osbert was opposed to one of the candidates for the archbishopric , William fitzHerbert , and worked to secure fitzHerbert 's deposition and replacement by Henry Murdac . After Murdac 's death in 1153 , Osbert tried to prevent the return of fitzHerbert , but these attempts were unsuccessful . When fitzHerbert died suddenly in 1154 , Osbert was accused of murdering the newly returned archbishop . 
Although he was never convicted of the murder in either a secular or an ecclesiastical court , he was stripped of his clerical status and became a layman before 1158 . He died after 1184 , perhaps even after 1194 . + + = = Early life = = + + Osbert was first mentioned in the historical record between 1121 and 1128 when he appears in a charter , which although likely a forgery , probably contains an authentic witness list . This document lists him as " Osbert archdeacon " , which means that he probably held the archdeaconry of Richmond . He was the nephew of Thurstan , who was Archbishop of York from 1114 to 1140 . Presumably he owed his position as archdeacon to his uncle and was probably appointed at a young age . A charter of Thurstan 's , dating to around 1138 , names Osbert explicitly as Thurstan 's nephew . + + = = Opposition to William fitzHerbert = = + + Osbert was opposed to the election of William fitzHerbert as Archbishop of York and supported William 's rival and successor Henry Murdac . Although he remained a supporter of Murdac after 1147 , he did oppose Murdac 's interventions in Selby Abbey , where Murdac had deposed one abbot and appointed another . In 1153 , Osbert deposed Murdac 's choice as abbot of Selby and appointed another abbot . Originally , Osbert had supported Elias Paynel , Murdac 's choice for abbot , but then changed his stance and helped with the deposition . + After Murdac 's death in 1153 , Osbert was opposed to William 's return as archbishop , but was unsuccessful in his attempts to prevent William 's reappointment . William died a week after his return to York , however , and Osbert , along with Robert of Ghent , the Dean of York , secured the quick election of the new archbishop , Roger de Pont L 'Évêque . + + = = Poisoning accusations = = + + Osbert was accused of murdering William , specifically by poisoning him through the communion chalice . 
A fellow cleric , Symphorian , who had been a chaplain of the deceased archbishop , brought murder charges against Osbert . Symphorian obtained a hearing on the charges at a royal council presided over by King Stephen of England at Michaelmas in 1154 , but Stephen 's subsequent death prevented a resolution . Osbert attempted to have the trial switched to an ecclesiastical court and was supported in his efforts by Archbishop Theobald of Canterbury . A trial was finally held in 1156 and Osbert 's accuser did not produce any witnesses , but Osbert was unable to prove his innocence , prompting the transfer of the case to a papal court . No record of any judgment exists , but Osbert apparently appeared before two popes , Adrian IV and Alexander III . A further appeal to the papal court was referred to papal judges @-@ delegate between 1175 and 1180 . + The case attracted commentary by two contemporary writers . John of Salisbury , who was a secretary for Theobald , added information about Osbert in a letter to Alexander III on unrelated business . In that section of the letter , John pointed out to the pope that no matter what others might say about Osbert , he had failed to secure other clergy willing to swear that he was innocent . Another contemporary , Gilbert Foliot , who was Bishop of Hereford , wrote to the pope to remind him that although Osbert 's accuser had offered to prove his accusations by undergoing a trial by ordeal , this was essentially meaningless since canon law forbade the clergy from undergoing the ordeal . + + = = Later life and death = = + + Osbert was no longer archdeacon by 1158 , as his successor is attested by that point . Osbert , however , continued to call himself " archdeacon " even though he held land as a secular lord , including lands in Lacy and Skipton . He also acted as a steward for Hugh de Tilly . 
Osbert was still alive in 1184 , as he was a witness to a document at York then , and may have been alive as late as 1194 , when Hugh Bardulf was responsible for the farm of Osbert 's lands ; the record of that transaction in the escheat roll is unclear as to whether Osbert was alive or dead at that time . + Osbert had two sons , William de Bayeux and Turstin de Baius . Osbert was a benefactor to a number of monasteries , including Drax Priory , Pontefract Priory and Gisborough Priory . He also gave land to a hospital in York and to the Templars and Hospitallers . + + + = Dvorak technique = + + The Dvorak technique ( developed between 1969 and 1984 by Vernon Dvorak ) is a widely used system to estimate tropical cyclone intensity ( which includes tropical depression , tropical storm , and hurricane / typhoon / intense tropical cyclone intensities ) based solely on visible and infrared satellite images . Within the Dvorak satellite strength estimate for tropical cyclones , there are several visual patterns that a cyclone may take on which define the upper and lower bounds on its intensity . The primary patterns used are curved band pattern ( T1.0 @-@ T4.5 ) , shear pattern ( T1.5 @-@ T3.5 ) , central dense overcast ( CDO ) pattern ( T2.5 @-@ T5.0 ) , central cold cover ( CCC ) pattern , banding eye pattern ( T4.0 @-@ T4.5 ) , and eye pattern ( T4.5 - T8.0 ) . + Both the central dense overcast and embedded eye pattern use the size of the CDO . The CDO pattern intensities start at T2.5 , equivalent to minimal tropical storm intensity ( 40 mph , 65 km / h ) . The shape of the central dense overcast is also considered . The eye pattern utilizes the coldness of the cloud tops within the surrounding mass of thunderstorms and contrasts it with the temperature within the eye itself . The larger the temperature difference is , the stronger the tropical cyclone . 
Once a pattern is identified , the storm features ( such as length and curvature of banding features ) are further analyzed to arrive at a particular T @-@ number . The CCC pattern indicates little development is occurring , despite the cold cloud tops associated with the quickly evolving feature . + Several agencies issue Dvorak intensity numbers for tropical cyclones and their precursors , including the National Hurricane Center 's Tropical Analysis and Forecast Branch ( TAFB ) , the NOAA / NESDIS Satellite Analysis Branch ( SAB ) , and the Joint Typhoon Warning Center at the Naval Meteorology and Oceanography Command in Pearl Harbor , Hawaii . + + = = Evolution of the method = = + + The technique was initially developed in 1969 by Vernon Dvorak , using satellite pictures of tropical cyclones within the northwest Pacific ocean . The system as it was initially conceived involved pattern matching of cloud features with a development and decay model . As the technique matured through the 1970s and 1980s , measurement of cloud features became dominant in defining tropical cyclone intensity and central pressure of the tropical cyclone 's low @-@ pressure area . Use of infrared satellite imagery led to a more objective assessment of the strength of tropical cyclones with eyes , using the cloud top temperatures within the eyewall and contrasting them with the warm temperatures within the eye itself . Constraints on short term intensity change are used less frequently than they were back in the 1970s and 1980s . The central pressures assigned to tropical cyclones have required modification , as the original estimates were 5 @-@ 10 hPa ( 0 @.@ 15 @-@ 0 @.@ 29 inHg ) too low in the Atlantic and up to 20 hPa ( 0 @.@ 59 inHg ) too high in the northwest Pacific . This led to the development of a separate wind @-@ pressure relationship for the northwest Pacific , devised by Atkinson and Holliday in 1975 , then modified in 1977 . 
+ Because analysis by humans introduces subjective biases into the technique , efforts have been made to make more objective estimates using computer programs , which have been aided by higher @-@ resolution satellite imagery and more powerful computers . Since tropical cyclone satellite patterns can fluctuate over time , automated techniques use a six @-@ hour averaging period to lead to more reliable intensity estimates . Development of the objective Dvorak technique , which performed best with tropical cyclones that had eyes ( of hurricane or typhoon strength ) , began in 1998 . It still required a manual center placement , keeping some subjectivity within the process . By 2004 , an advanced objective Dvorak technique had been developed which utilized banding features for systems below hurricane intensity and objectively determined the tropical cyclone 's center . A central pressure bias relating to the slope of the tropopause and cloud top temperatures , which change with latitude , was uncovered in 2004 , a discovery that helped improve central pressure estimates within the objective technique . + + = = Details of the method = = + + In a developing cyclone , the technique takes advantage of the fact that cyclones of similar intensity tend to have certain characteristic features , and as they strengthen , they tend to change in appearance in a predictable manner . The structure and organization of the tropical cyclone are tracked over 24 hours to determine if the storm has weakened , maintained its intensity , or strengthened . Various central cloud and banding features are compared with templates that show typical storm patterns and their associated intensity . If infrared satellite imagery is available for a cyclone with a visible eye pattern , then the technique utilizes the difference between the temperature of the warm eye and the surrounding cold cloud tops to determine intensity ( colder cloud tops generally indicate a more intense storm ) . 
In each case a " T @-@ number " ( an abbreviation for Tropical Number ) and a Current Intensity ( CI ) value are assigned to the storm . These measurements range between 1 ( minimum intensity ) and 8 ( maximum intensity ) . The T @-@ number and CI value are the same except for weakening storms , in which case the CI is higher . For weakening systems , the CI is held as the tropical cyclone intensity for 12 hours , though research from the National Hurricane Center indicates that six hours is more reasonable . The table at right shows the approximate surface wind speed and sea level pressure that corresponds to a given T @-@ number . The amount a tropical cyclone can change in strength per 24 ‑ hour period is limited to 2 @.@ 5 T @-@ numbers per day . + + = = = Pattern types = = = + + Within the Dvorak satellite strength estimate for tropical cyclones , there are several visual patterns that a cyclone may take on which define the upper and lower bounds on its intensity . The primary patterns used are curved band pattern ( T1.0 @-@ T4.5 ) , shear pattern ( T1.5 @-@ T3.5 ) , central dense overcast ( CDO ) pattern ( T2.5 @-@ T5.0 ) , banding eye pattern ( T4.0 @-@ T4.5 ) , eye pattern ( T4.5 - T8.0 ) , and central cold cover ( CCC ) pattern . Both the central dense overcast and embedded eye pattern utilize the size of the CDO . The CDO pattern intensities start at T2.5 , equivalent to minimal tropical storm intensity ( 40 miles per hour ( 64 km / h ) ) . The shape of the central dense overcast is also considered . The farther the center is tucked into the CDO , the stronger it is deemed . Tropical cyclones with maximum sustained winds between 65 miles per hour ( 105 km / h ) and 100 miles per hour ( 160 km / h ) can have their center of circulations obscured by cloudiness within visible and infrared satellite imagery , which makes diagnosis of their intensity a challenge . 
+ The CCC pattern , with its large and quickly developing mass of thick cirrus clouds spreading out from an area of convection near a tropical cyclone center within a short time frame , indicates little development . When it develops , rainbands and cloud lines around the tropical cyclone weaken and the thick cloud shield obscures the circulation center . While it resembles a CDO pattern , it is rarely seen . + The eye pattern utilizes the coldness of the cloud tops within the surrounding mass of thunderstorms and contrasts it with the temperature within the eye itself . The larger the temperature difference is , the stronger the tropical cyclone . Winds within tropical cyclones can also be estimated by tracking features within the CDO using rapid scan geostationary satellite imagery , whose pictures are taken minutes apart rather than every half @-@ hour . + Once a pattern is identified , the storm features ( such as length and curvature of banding features ) are further analyzed to arrive at a particular T @-@ number . + + = = Usage = = + + Several agencies issue Dvorak intensity numbers for tropical cyclones and their precursors . These include the National Hurricane Center 's Tropical Analysis and Forecast Branch ( TAFB ) , the National Oceanic and Atmospheric Administration 's Satellite Analysis Branch ( SAB ) , and the Joint Typhoon Warning Center at the Naval Pacific Meteorology and Oceanography Center in Pearl Harbor , Hawaii . + The National Hurricane Center will often quote Dvorak T @-@ numbers in their tropical cyclone products . The following example is from discussion number 3 of Tropical Depression 24 ( eventually Hurricane Wilma ) of the 2005 Atlantic hurricane season : + BOTH TAFB AND SAB CAME IN WITH A DVORAK SATELLITE INTENSITY ESTIMATE OF T2.5 / 35 KT . HOWEVER ... OFTENTIMES THE SURFACE WIND FIELD OF LARGE DEVELOPING LOW PRESSURE SYSTEMS LIKE THIS ONE WILL LAG ABOUT 12 HOURS BEHIND THE SATELLITE SIGNATURE . THEREFORE ... 
THE INITIAL INTENSITY HAS ONLY BEEN INCREASED TO 30 KT . + Note that in this case the Dvorak T @-@ number ( in this case T2.5 ) was simply used as a guide but other factors determined how the NHC decided to set the system 's intensity . + The Cooperative Institute for Meteorological Satellite Studies ( CIMSS ) at the University of Wisconsin – Madison has developed the Objective Dvorak Technique ( ODT ) . This is a modified version of the Dvorak technique which uses computer algorithms rather than subjective human interpretation to arrive at a CI number . This is generally not implemented for tropical depressions or weak tropical storms . The China Meteorological Agency ( CMA ) is expected to start using the standard 1984 version of Dvorak in the near future . The Indian Meteorological Department ( IMD ) prefers using visible satellite imagery over infrared imagery due to a perceived high bias in estimates derived from infrared imagery during the early morning hours of convective maximum . The Japan Meteorological Agency ( JMA ) uses the infrared version of Dvorak over the visible imagery version . Hong Kong Observatory and JMA continue to utilize Dvorak after tropical cyclone landfall . Various centers hold on to the maximum current intensity for 6 – 12 hours , though this rule is broken when rapid weakening is obvious . + Citizen science site Cyclone Center uses a modified version of the Dvorak technique to categorize post @-@ 1970 tropical weather . + Satellite Images of Selected Tropical Storms and Associated T @-@ Number + + = = Benefits and disadvantages = = + + The most significant benefit of the use of the technique is that it has provided a more complete history of tropical cyclone intensity in areas where aircraft reconnaissance is neither possible nor routinely available . 
Intensity estimates of maximum sustained wind are currently within 5 miles per hour ( 8 @.@ 0 km / h ) of what aircraft are able to measure half of the time , though the assignment of intensity of systems with strengths between moderate tropical @-@ storm force ( 60 miles per hour ( 97 km / h ) ) and weak hurricane- or typhoon @-@ force ( 100 miles per hour ( 160 km / h ) ) is the least certain . The technique has not always been this precise , as refinements in the technique led to intensity changes between 1972 and 1977 of up to 20 miles per hour ( 32 km / h ) . The method is internally consistent in that it constrains rapid increases or decreases in tropical cyclone intensity . Some tropical cyclones fluctuate in strength more than the 2 @.@ 5 T numbers per day limit allowed by the rule , which can work to the technique 's disadvantage and has led to occasional abandonment of the constraints since the 1980s . Systems with small eyes near the limb , or edge , of a satellite image can be estimated as too weak using the technique , which can be resolved through use of polar @-@ orbiting satellite imagery . Subtropical cyclone intensity cannot be determined using Dvorak , which led to the development of the Hebert @-@ Poteat technique in 1975 . Cyclones undergoing extratropical transition , losing their thunderstorm activity , see their intensities underestimated using the Dvorak technique . This led to the development of the Miller and Lander extratropical transition technique which can be used under these circumstances . + + + = New York State Route 31B = + + New York State Route 31B ( NY 31B ) was a state highway in central New York in the United States . It served as a connector between NY 31 , its parent route , in the Cayuga County village of Weedsport and NY 5 in the Onondaga County town of Elbridge . NY 31B was assigned c . 1933 , replacing New York State Route 293 , a route assigned as part of the 1930 renumbering of state highways in New York . 
The NY 31B designation was removed in 1980 and replaced by County Route 31B ( CR 31B ) in Cayuga County and CR 99 in Onondaga County . + + = = Route description = = + + NY 31B began at an intersection with its parent route , NY 31 , in the Cayuga County village of Weedsport . The highway went eastward , intersecting with NY 34 less than 0 @.@ 1 miles ( 0 @.@ 2 km ) later . Much of Weedsport was urbanized , with the highway passing residential homes and businesses as it progressed eastward through the village . The highway intersected with CR 12B before leaving Weedsport and entering the town of Brutus as Brutus Road . + In Brutus , NY 31B continued to the east through the rural town , intersecting CR 136A and passing the Weedsport Rural Cemetery before turning to the southeast . After a short distance , the highway went through an isolated area of homes , where it intersected with CR 14 and CR 15A . NY 31B continued on , intersecting several local highways before crossing into Onondaga County and terminating at an intersection with NY 5 in the town of Elbridge . + + = = History = = + + What became NY 31B was originally designated as NY 293 as part of the 1930 renumbering of state highways in New York . NY 293 was renumbered to NY 31B c . 1933 , allowing the NY 293 designation to be transferred to another highway in Orange County . NY 31B remained unchanged until January 2 , 1980 , when the NY 31B designation was removed . + Ownership and maintenance of NY 31B 's former routing was gradually transferred to the counties it went through , namely Cayuga and Onondaga . The first section to be transferred was the portion within Onondaga County , which was given to the county on April 1 , 1980 , and designated as CR 99 . The Cayuga County section of former NY 31B was transferred to the county one year later on April 1 , 1981 , and designated as CR 31B for the state route that preceded it . 
+ + = = Major intersections = = + + + + = Ben Amos = + + Benjamin Paul " Ben " Amos ( born 10 April 1990 ) is an English professional footballer who plays as a goalkeeper for Bolton Wanderers . Born in Macclesfield , Cheshire , Amos began his career with Crewe Alexandra 's youth academy , but joined Manchester United at the age of 11 . He has spent time on loan at Peterborough United , Molde , and Oldham Athletic . Additionally , Amos is an England youth international , having represented his country at every level from Under @-@ 16 to Under @-@ 21 . + + = = Club career = = + + + = = = Crewe Alexandra = = = + + Amos was born in Macclesfield , Cheshire and was a member of the Crewe Alexandra academy until he was released at the age of 10 . While at Crewe , he also played for another local team , Bollington United , as a centre midfielder . One year later , Amos was playing for his local team against the team at the top of the table , and they needed to win the match to win the league . Amos ' team 's goalkeeper was injured during the game , and as the tallest on the team , Amos was put in goal ; however , he had also been the team 's regular penalty taker all season , so when they were awarded a penalty , he went all the way up the pitch to take the kick . He scored , and his team went on to win the game 3 – 2 , together with the league title . After the game , his parents told him that a Manchester United scout had been watching him and that he had been invited for trials . Amos joined Manchester United at the age of 11 . + + = = = Manchester United = = = + + In his first season at Manchester United , Amos became a regular starter for the club 's Under @-@ 13 team , playing in 19 out of 27 matches in the 2001 – 02 season . Amos was named as an unused substitute for the Under @-@ 18 side for the first time on 8 January 2005 , for a league game against Manchester City . 
His first appearance for the Under @-@ 18s came exactly nine months later , on 8 October 2005 , coming on as a substitute for Danny Rose after starting goalkeeper Ron @-@ Robert Zieler was sent off in a 2 – 0 defeat to Bolton Wanderers . He was regularly named as an unused substitute during the 2005 – 06 season – including for two reserve team matches – but became a frequent starter for the Under @-@ 18s in 2006 – 07 after signing a trainee contract in July 2006 . However , he missed the final of the 2006 – 07 FA Youth Cup with a dislocated shoulder . + He retained his place in the Under @-@ 18 team for 2007 – 08 , in addition to making his debut for the reserve team against Wigan Athletic on 7 November 2007 , and during the season he impressed enough to be selected to go on the first @-@ team 's 2008 summer tour of South Africa . Amos was named as a substitute for all three matches of the tour , but did not play . En route back from South Africa , United stopped off in Nigeria to play against Portsmouth on 27 July 2008 , with Amos replacing Tomasz Kuszczak after 76 minutes . He made his competitive first @-@ team debut on 23 September 2008 in a 3 – 1 win at home to Middlesbrough in the third round of the League Cup . On 14 December 2008 , Amos travelled to Japan with the Manchester United squad for the 2008 FIFA Club World Cup , having been called up as a late replacement for Ben Foster , who had suffered a hand injury while training . + + = = = Loans to Peterborough and Molde = = = + + On 29 October 2009 , Amos signed for the Championship team Peterborough United on a month 's loan as cover for Peterborough 's suspended first @-@ choice goalkeeper Joe Lewis . He made his only appearance on 31 October in a 2 – 1 defeat against Barnsley . After returning to Manchester United , Amos was again sent out on loan in March 2010 , this time to Norwegian side Molde FK , where he remained on loan until 30 June 2010 . 
+ + = = = Return to Manchester United = = = + + Following the departure of Ben Foster from Manchester United to Birmingham City , United manager Alex Ferguson declared that Amos would be Manchester United 's third @-@ choice goalkeeper for the 2010 – 11 season behind Edwin van der Sar and Tomasz Kuszczak . He made his first appearance of the season on 26 October 2010 , starting in goal for United 's 3 – 2 win over Wolverhampton Wanderers in the fourth round of the League Cup . + In United 's final Champions League group match on 7 December 2010 , Amos was picked to start against Valencia at Old Trafford . Pablo Hernández scored Valencia 's only goal past him after 32 minutes of the match – the first goal United had conceded in the Champions League that season – as the two sides played out a 1 – 1 draw . + + = = = Loan to Oldham Athletic = = = + + With the signing of Danish goalkeeper Anders Lindegaard , Manchester United allowed Amos to join Oldham Athletic on loan for the remainder of the season on 7 January 2011 , although he would continue to train with Manchester United once a week . He made his debut against Swindon Town the next day and kept a clean sheet . However , three days later , he conceded all six goals in a 6 – 0 defeat at home to Southampton ; he allowed Adam Lallana 's 20 @-@ yard shot underneath his body for the second goal , and he was rounded by Lee Barnard for the sixth . + On 15 March 2011 , Lindegaard was ruled out for five weeks following a knee injury , so Amos was recalled from Oldham to cover for Edwin van der Sar and Tomasz Kuszczak . + + = = = Return to Manchester United = = = + + Amos made his first start of the 2011 – 12 season in a third round League Cup tie at Elland Road against Leeds United . He kept a clean sheet as United cruised through to the next round winning 3 – 0 . He played again in the following round away at Aldershot Town , a game which United also won 3 – 0 and advanced to the quarter finals . 
He was in goal again for United 's League Cup game in a defeat against Championship club Crystal Palace on 30 November . It also seemed that he had moved above last season 's second choice Tomasz Kuszczak in the pecking order , but was now third choice behind Anders Lindegaard and David de Gea . He made his first Premier League start in a 2 – 0 home win against Stoke City on 31 January 2012 , keeping a clean sheet on his debut . + On 30 May 2012 , Amos signed a three @-@ year contract extension with Manchester United , which would keep him at the club until at least 2015 . + + = = = Loan to Hull City = = = + + On 31 July 2012 , Amos joined Championship team Hull City on a season @-@ long loan from Manchester United . Before the loan deal was completed , he had joined Hull on their pre @-@ season training camp in Portugal . He made his debut on 11 August 2012 in a 7 – 6 penalty shoot @-@ out victory over Rotherham United in the first round of the 2012 – 13 Football League Cup . However , after 19 appearances , including two in the League Cup , Amos returned to Manchester United on 3 January 2013 . + + = = = Loan to Carlisle United = = = + + On 15 November 2013 , Amos joined League One team Carlisle United on a month @-@ long loan from Manchester United . + + = = = Loan to Bolton Wanderers = = = + + On 30 January 2015 he joined Championship club Bolton Wanderers on a month 's loan to provide competition for Andy Lonergan after Ádám Bogdán was ruled out with an ankle injury picked up in training . On 21 February 2015 he made his Bolton debut as a substitute for the injured Lonergan as Wanderers lost 4 – 1 to Nottingham Forest at the City Ground . He played nine Championship games for Bolton before his loan expired on 4 April .
+ + = = = Return to Manchester United = = = + + Ben Amos returned to Manchester United after his loan spell at Bolton Wanderers , but on 10 June 2015 , it was announced on the Manchester United website that Amos had been released from the club along with four other players . + + = = = Bolton Wanderers = = = + + On 1 July 2015 , Amos returned to Bolton Wanderers following his release from Manchester United , signing a four @-@ year contract with the club . + + = = International career = = + + Amos is an England youth international , having played for his country at the Under @-@ 16 , Under @-@ 17 , Under @-@ 18 , Under @-@ 19 , Under @-@ 20 and Under @-@ 21 levels . He was called up to the Under @-@ 21 squad in 2011 for their friendly match against Iceland , but he was an unused substitute . + + = = Career statistics = = + + As of 1 December 2015 + + = = Honours = = + + + = = = Club = = = + + Manchester United + FIFA Club World Cup ( 1 ) : 2008 + + = = Personal life = = + + Amos attended Fallibroome High School , where he earned 11 GCSEs at grade C or above . + + + = Clayton Kershaw = + + Clayton Edward Kershaw ( born March 19 , 1988 ) is an American professional baseball pitcher for the Los Angeles Dodgers of Major League Baseball ( MLB ) . A left @-@ handed starting pitcher , Kershaw has played in the major leagues since 2008 , and his career earned run average ( ERA ) and walks and hits per innings pitched average ( WHIP ) are the lowest among starters in the live @-@ ball era with a minimum of 1 @,@ 000 innings pitched . His career hits allowed per nine innings pitched average ( 6 @.@ 64 ) is the second lowest in MLB history . A three @-@ time Cy Young Award winner , the 2014 National League Most Valuable Player , and the Los Angeles Dodgers ' all @-@ time leader in walks and hits per innings pitched ( 1 @.@ 01 ) and hits allowed per nine innings pitched ( 6 @.@ 64 ) , Kershaw is considered by many to be the best pitcher in MLB .
+ Kershaw was drafted seventh overall in the 2006 MLB draft . He worked his way through the Dodgers ' farm system in just one full season , and reached the majors at 20 years old . When he debuted in 2008 , he was the youngest player in MLB , a title he held for one full year . In 2011 , he won the pitching Triple Crown and the National League Cy Young Award , becoming the youngest pitcher to accomplish either of these feats since Dwight Gooden in 1985 . Kershaw pitched a no @-@ hitter on June 18 , 2014 , becoming the 22nd Dodger to do so . Being a left @-@ handed strikeout pitcher and playing for the Los Angeles Dodgers , Kershaw has often been compared to Hall of Fame pitcher Sandy Koufax . He became the first pitcher in history to lead MLB in ERA for four consecutive years when he did so in the 2011 through 2014 seasons . + Off the field , Kershaw is an active participant in volunteer work . He and his wife , Ellen , launched " Kershaw 's Challenge " and wrote the book Arise to raise money to build an orphanage in Zambia . He has been honored with the Roberto Clemente Award and the Branch Rickey Award for his humanitarian work . + + = = Early life = = + + Kershaw was born in Dallas , Texas on March 19 , 1988 . His parents divorced when he was 10 , and he was raised by his mother . He played in youth sports leagues as a child , including Little League Baseball . + Kershaw attended nearby Highland Park High School , where he played baseball and was also the center for quarterback Matthew Stafford on the football varsity . After a growth spurt and further development of his pitches , he established himself as an elite high school prospect in 2006 when he posted a 13 – 0 record with an earned run average ( ERA ) of 0 @.@ 77 , and recorded 139 strikeouts in 64 innings pitched . In a playoff game against Northwest High School of Justin , Texas , Kershaw pitched an all @-@ strikeout perfect game . 
He struck out all 15 batters he faced in the game , which was shortened because of the mercy rule . He also pitched for USA Baseball 's Junior National Team in the Pan Am Championship . Kershaw was selected by USA Today as " High School Baseball Player of the Year " , and was also the Gatorade National Player of the Year for baseball . + Entering the 2006 Major League Baseball ( MLB ) draft , Kershaw was considered the top high @-@ school pitcher available . The Los Angeles Dodgers selected Kershaw with the seventh overall pick in the draft . He had committed to Texas A & M University , but turned down the scholarship offer to sign with the Dodgers , with a bonus estimated at $ 2 @.@ 3 million . The bonus was the largest to any Dodgers draft pick at the time , and was eventually topped by Zach Lee in the 2010 draft . + + = = Professional career = = + + + = = = Minor Leagues = = = + + Kershaw began his career with the Gulf Coast League ( GCL ) Dodgers . He pitched in 37 innings in which he struck out 54 batters ( walking only 5 ) , while compiling a record of 2 – 0 with a 1 @.@ 95 ERA . He featured a fastball that topped out at 96 miles per hour ( 154 km / h ) and he was rated as the top prospect in the GCL , and the Dodgers ' second best prospect by Baseball America behind third baseman Andy LaRoche . + Kershaw was promoted to the Great Lakes Loons in 2007 , where he recorded a record of 7 – 5 with a 2 @.@ 77 ERA . He was selected to play on the East Team in the Midwest League All @-@ Star Game and on the USA team in the All @-@ Star Futures Game . On August 6 , he was promoted to the Double @-@ A Jacksonville Suns in the Southern League , where he produced a 1 – 2 record and 3 @.@ 65 ERA in five starts and was selected as the top prospect in the Dodgers organization heading into the 2008 season . 
+ During spring training in a game against the Boston Red Sox , Kershaw gained much attention for throwing a curveball to Sean Casey that started behind Casey but at the end looped into the strike zone and struck him out looking . Kershaw was 0 – 3 and had a 2 @.@ 28 ERA with 47 strikeouts through 43 1 ⁄ 3 innings pitched in his first stint of the year with the Suns . He was then called up to the majors on May 28 , 2008 , but optioned back to Jacksonville on July 2 . + Kershaw pitched 18 innings during his second trip to Jacksonville ( two starts and one seven inning relief appearance ) , winning two games . During this stretch , he allowed only two earned runs , lowering his ERA to 1 @.@ 91 . He was recalled on July 22 . + + = = = Los Angeles Dodgers = = = + + + = = = = 2008 – 2010 seasons : Early career = = = = + + On May 24 , 2008 , the Dodgers bought Kershaw 's minor @-@ league contract , and he was added to the active roster . Sportswriter Tony Jackson called Kershaw 's debut the most anticipated start by a Dodgers pitcher since Hideo Nomo 's major league debut during the 1995 season . He made his debut on May 25 , starting against the St. Louis Cardinals . He struck out the first batter he faced , Skip Schumaker , the first of seven strikeouts in the game , in which he pitched six innings and allowed two runs . When he debuted , Kershaw was the youngest player in MLB , a title he held for one full year . + Kershaw won his first major league game against the Washington Nationals on July 27 , 2008 . He pitched six @-@ plus shutout innings , allowing four hits , a walk , and he struck out five . Kershaw finished his rookie season 5 – 5 , with a 4 @.@ 26 ERA in 22 games ( 21 starts ) . He also pitched two innings out of the bullpen for the Dodgers in the 2008 National League Championship Series ( NLCS ) against the Philadelphia Phillies .
+ On April 15 , 2009 , Kershaw pitched seven innings , striking out 13 batters while allowing only one hit ( a solo home run ) against the rival San Francisco Giants . He was the youngest Dodger to ever strike out 13 or more batters in a game since Sandy Koufax did it in the 1955 season . On May 17 , 2009 , Kershaw did not allow a hit against the Florida Marlins through 7 innings , then gave up a lead @-@ off double to Florida 's Cody Ross . In 2009 , despite an 8 – 8 record , he led the major leagues in opposing batting average ( .200 ) , opposing slugging percentage ( .282 ) , and hits per nine innings ( 6 @.@ 26 ) . He also posted an ERA of 2 @.@ 79 and 185 strikeouts . Kershaw also walked 91 batters , which was second most in the National League ( NL ) . + Kershaw made his playoff starting debut against the St. Louis Cardinals in the 2009 National League Division Series ( NLDS ) . He went 6 2 ⁄ 3 innings , striking out 4 , walking 1 , and ended up getting a no @-@ decision ( the Dodgers went on to win the game in the 9th inning ) . At 21 years old , he started the opener of the 2009 NLCS against the Philadelphia Phillies and was the third youngest pitcher to ever start a playoff series opener , behind only Fernando Valenzuela in the 1981 NLDS and Rick Ankiel in the 2000 NLDS . + Kershaw started the 2010 season by posting a 3 @.@ 07 ERA in April , but did so by walking 22 batters in 29 innings . On May 4 , he had his worst start of his career against the Milwaukee Brewers at Dodger Stadium , throwing just 57 pitches in 1 1 ⁄ 3 innings , while retiring only four of the 13 batters he faced — including the pitcher . He was booed loudly upon being pulled from the game . Kershaw said after the game , " I didn 't give our team any kind of chance . It 's just not a good feeling to let your teammates down , let everybody down . It stings , it hurts . I 've got to figure things out .
" + Kershaw rebounded in his next start by pitching an 8 inning two @-@ hitter and out @-@ dueling the then undefeated Ubaldo Jiménez . He credited his control of the slider as being the major turning point for him . Later in the season , he was suspended for five games after hitting Aaron Rowand of the Giants with a pitch in a game on July 20 . The incident occurred after both teams were given a warning following Giants ace Tim Lincecum hitting Matt Kemp earlier in the game . He threw his first career complete game shutout on September 14 , 2010 also against San Francisco and finished the season with a record of 13 – 10 and a 2 @.@ 91 ERA in 32 starts , pitching 204 1 ⁄ 3 innings and recording 212 strikeouts . + + = = = = 2011 season : 1st Cy Young Award = = = = + + After finishing the 2010 season strong , the Dodgers named Kershaw as the Opening Day Starter for the 2011 season . On May 29 , he pitched the second complete @-@ game shutout of his career , striking out 10 while winning a two @-@ hitter against the Florida Marlins , 8 – 0 ; he also had two singles and an RBI , scoring twice in the game . He produced his third career shutout on June 20 , a two @-@ hit , 11 @-@ strikeout effort against the Detroit Tigers . Kershaw became the first Dodgers starter to strike out the side in the 9th inning since Sandy Koufax 's perfect game . In his next start , on June 26 , Kershaw pitched another complete game ( against the Los Angeles Angels of Anaheim ) . He became the first Dodger starter to have back @-@ to @-@ back complete game victories since Jeff Weaver in the 2005 season and the first Dodger to have double @-@ digit strikeouts in consecutive starts since Chan @-@ Ho Park in the 2000 season . He was awarded the National League Player of the Week award for the week of June 20 – 26 as a result of those two starts . Midway through June , Kershaw had amassed 32 career victories , a 3 @.@ 15 ERA and 593 career strikeouts in 568 @.@ 2 innings .
According to the Elias Sports Bureau , Kershaw was the first 23 @-@ year @-@ old pitcher to have that many victories , an ERA that low and an average of more than one strikeout per inning since ERA became an official statistic in 1910 . + Kershaw was selected to the National League team for the 2011 Major League Baseball All @-@ Star Game , his first All @-@ Star selection . In the month of July , Kershaw was 4 – 1 with a 2 @.@ 02 ERA and NL @-@ leading 45 strikeouts , earning him the National League Pitcher of the Month Award . On August 23 , he struck out Matt Holliday of the St. Louis Cardinals for his 200th strikeout of the season and became the 10th Dodger pitcher to record back @-@ to @-@ back 200 strikeout seasons and the first since Chan @-@ Ho Park did it in the 2001 season . + Kershaw finished the 2011 season by leading the NL with 21 wins , 248 strikeouts and a 2 @.@ 28 ERA , winning the NL pitching Triple Crown , the first Triple Crown winner since Jake Peavy of the 2007 San Diego Padres and the first Dodger since Sandy Koufax won it in the 1966 season . Justin Verlander of the Detroit Tigers won the American League Triple Crown the same season , marking the first major @-@ league season since 1924 to feature Triple Crown @-@ winning pitchers in both leagues . Kershaw 's 21 wins were the most by a Dodger pitcher since Orel Hershiser won 23 during the 1988 season . His ERA was the lowest by a Dodger since Hershiser 's 2 @.@ 03 in the 1985 season , his strikeouts were the most by a Dodger since Koufax 's 317 in 1966 and his 233 1 ⁄ 3 innings pitched were the most since Chan Ho Park pitched 234 in 2001 . Since 1965 when Koufax did it , Peavy and Kershaw are the only two pitchers in the National League to have led the league in wins , strikeouts , ERA , and WHIP ( walks plus hits per inning pitched ) . Kershaw also became just the second lefthander to have 240 @-@ plus strikeouts in a season before the age of 24 , joining Vida Blue .
+ After the season , Kershaw was awarded the Warren Spahn Award as the best left @-@ handed pitcher in 2011 , the Players Choice Award for Most Outstanding National League pitcher , the Gold Glove Award as the top fielding pitcher in the NL and the Sporting News ( TSN ) National League Pitcher of the Year . He was additionally selected as the starting pitcher for the TSN NL All @-@ Star Team . On November 17 , he was honored with the National League Cy Young Award , making him the youngest Cy Young winner since Dwight Gooden of the 1985 New York Mets . He was the 8th Dodger pitcher to win the award , the first since Éric Gagné in the 2003 season . + + = = = = 2012 season : Cy Young runner @-@ up = = = = + + On February 7 , 2012 , Kershaw and the Dodgers agreed on a two @-@ year , $ 19 million contract . The contract was the second highest for a player in his first year of arbitration ( after Tim Lincecum 's $ 23 million 2 @-@ year contract in 2010 ) . + Kershaw was the Dodgers ' Opening Day starter for the second year in a row , where he pitched three innings of shutout ball against the San Diego Padres at Petco Park before being removed from the game due to flu @-@ like symptoms . On April 27 , he was able to last through eight innings for his second win of the season against the Washington Nationals . The win was also his 12th straight home win , tying him with Ed Roebuck ( June 1960 – August 1962 ) and Orel Hershiser ( September 1984 – October 1985 ) for the longest home winning streak since the Dodgers moved to Los Angeles . Kershaw won the National League 's Player of the Week Award for the week of May 14 – 20 after he made two starts during that week and pitched 16 scoreless innings , including his fourth career shutout . Kershaw was selected to appear in the 2012 Major League Baseball All @-@ Star Game , the second straight year he made the team . 
On August 11 , he went over 200 innings on the season , becoming the 12th Los Angeles Dodger pitcher with three or more seasons of 200 or more innings , and the first since Hershiser did it five times from 1985 to 1989 . Kershaw also became just the fifth Dodger pitcher with three straight 200 strikeout seasons . + Kershaw finished 2012 with a 14 – 9 record , a 2 @.@ 53 ERA ( leading the league ) , 229 strikeouts , and 227 2 ⁄ 3 innings pitched , coming second in both categories . He became the first pitcher to lead the league in ERA in consecutive seasons since Arizona 's Randy Johnson in 2001 – 02 . This also marked his fourth year in a row with a sub @-@ 3 @.@ 00 ERA , making him the first to do this since Randy Johnson from 1999 – 2002 . He finished second for the NL Cy Young behind R. A. Dickey , receiving two first place votes . + + = = = = 2013 season : 2nd Cy Young Award = = = = + + Kershaw made his third straight opening day start for the Dodgers in the 2013 season , the first Dodger starter to do so since Derek Lowe ( 2005 – 2007 ) . In that opening day start he pitched a complete game , four hit , shutout over the Giants and also hit his first career home run . He was the first pitcher to throw a shutout and hit a home run on opening day since Bob Lemon of the Cleveland Indians did so against the Chicago White Sox on April 14 , 1953 . Kershaw picked up his 1,000th career strikeout on April 17 , 2013 , when he struck out Yonder Alonso of the Padres . He was the second youngest Dodger to reach that mark , behind only Fernando Valenzuela . On May 14 , Kershaw passed the 1 @,@ 000 inning mark for his career . His ERA of 2 @.@ 70 at the time was the fifth best of the live @-@ ball era at the 1 @,@ 000 inning mark and the best career mark . He also threw 130 pitches that day , the most of his career and the most by a Dodger pitcher since Odalis Pérez in the 2003 season .
+ Kershaw was selected to the 2013 Major League Baseball All @-@ Star Game , his third straight selection . In July , he compiled a 4 – 1 record and 1 @.@ 34 ERA in six starts and was awarded his second National League Pitcher of the Month Award . On September 2 , Kershaw picked up his 200th strikeout of 2013 , joining Hall of Famers Sandy Koufax and Don Drysdale as the only starters in Dodgers history with at least 4 consecutive seasons of more than 200 strikeouts . + Kershaw finished the season with a 16 @-@ 9 record , 236 innings pitched ( a career high ) , and a Major League best 1 @.@ 83 ERA and 0 @.@ 92 WHIP . He was the third player in history to lead the Majors in ERA three years in a row , joining Greg Maddux ( 1993 – 95 ) and Lefty Grove ( 1929 – 31 ) . His ERA was the first sub @-@ 2 @.@ 00 ERA since Roger Clemens did it in the 2005 season and the lowest overall since Pedro Martínez in the 2000 season . He was only the third Dodger pitcher to have an ERA under 3 @.@ 00 in five consecutive seasons ( Koufax and Nap Rucker ) . + Kershaw struck out 12 batters in seven innings in the first game of the 2013 National League Division Series . That was the third most strikeouts by a Dodger pitcher in the playoffs , behind only Koufax ( 15 in the 1963 World Series ) and Carl Erskine ( 14 in the 1953 World Series ) . His six straight strikeouts in the game tied a MLB postseason record set by Tim Belcher in the second game of the 1988 World Series . He picked up his first career postseason victory in that game . + Kershaw won the Warren Spahn Award for 2013 , the second time he had won the award , which honors the best left @-@ handed pitcher in the Major Leagues . He was also selected to the Sporting News NL All @-@ Star team , the fourth Dodger pitcher to be named to the team twice ( after Koufax , Valenzuela and Don Newcombe ) . On November 13 , he won the NL Cy Young Award for the second time in three seasons . 
He became just the sixth pitcher in history to finish in the top two in voting three seasons in a row . + After the season , Kershaw and the Dodgers agreed on a seven @-@ year , $ 215 million , contract extension . The deal was the richest in MLB history for a pitcher , eclipsing the seven @-@ year , $ 180 million , contract signed by Justin Verlander the previous year . The average annual value of $ 30 @.@ 7 million was also the largest ever for a baseball player , beating the $ 28 million Roger Clemens received in 2007 and the 10 @-@ year , $ 275 million contract that Alex Rodriguez signed that same year . + + = = = = 2014 season : MVP and 3rd Cy Young Award = = = = + + Kershaw made his fourth straight opening day start for the Dodgers in 2014 , only the fourth Dodger ever to do so . This season the game was played at the Sydney Cricket Ground in Australia . Before his second start , Kershaw felt some pain in his back and was placed on the disabled list for the first time in his career . He did not rejoin the Dodgers until early May . On June 18 , he pitched a complete game no @-@ hitter against the Colorado Rockies and struck out a career @-@ high 15 batters . The only batter to reach base was due to an error in the top of the seventh inning , costing Kershaw a perfect game . He is the only pitcher in MLB history with 15 strikeouts in a game while allowing no hits and no walks . Kershaw was 6 @-@ 0 with an 0 @.@ 82 ERA in June and was awarded with his third career Pitcher of the Month award . He was selected to the National League squad at the 2014 Major League Baseball All @-@ Star Game , his fourth straight selection . He was the sixth Dodger pitcher , and the first since Fernando Valenzuela to make the All @-@ Star team four years in a row . + Kershaw had a 41 inning scoreless inning streak that ended in the top of the sixth inning on July 10 when , with two outs , Chase Headley homered to left field at Dodger Stadium . 
Kershaw 's streak was , at the time , tied for the fifteenth longest scoreless inning streak in MLB history . He won the pitcher of the month award again in July , the third Dodger ( along with Don Sutton and Burt Hooton ) to win it two months in a row . He was 4 – 0 with a 1 @.@ 10 ERA in the month with 48 strikeouts and only 10 walks . He picked up his 200th strikeout of the season on September 2 , the fifth year in a row he had reached that number , trailing only the six seasons in a row for Sandy Koufax among Dodger starters . He also became just the fourth pitcher since 1893 to have at least five 200 @-@ strikeout seasons through an age @-@ 26 season ( Bert Blyleven , Walter Johnson and Sam McDowell are the others ) . + Kershaw finished the season 21 – 3 with a 1 @.@ 77 ERA in 27 starts . He led the National League in numerous categories once again , such as ERA , ERA + , Wins , Win % , WHIP , IP / GS , SO / 9 , Strikeout @-@ to @-@ walk ratio , complete games , FIP , and Wins Above Replacement for both pitchers and all NL players . He also finished third in strikeouts despite missing most of the first month of the season . He was the first pitcher in history to win four consecutive ERA titles . Many experts called his 2014 season one of the best pitching seasons in recent memory . + However , in his first start of the playoffs , in Game 1 of the Division Series against the Cardinals , Kershaw became the first pitcher in history to strike out 10 while allowing eight runs . He had cruised through the first six innings while allowing only two hits ( both solo homers ) and surrendered six runs in the seventh . He did tie Koufax for the only Dodgers pitchers with multiple double digit strikeout games in the playoffs . He was also the first pitcher in history to give up at least seven runs in back @-@ to @-@ back postseason starts ( his previous one was Game 6 of the 2013 National League Championship Series ) . 
Pitching on short rest in Game 4 , he would again be dominant , but again would take the loss after giving up a 3 @-@ run home run to Matt Adams in the 7th inning . It was the first home run Kershaw had allowed in his career to a left @-@ handed batter off his curveball . + Kershaw was honored after the season with player of the year awards from both The Sporting News and Baseball America . He won three awards at the Players Choice Awards including Outstanding NL Pitcher , Player of the Year and the Marvin Miller Man of the Year Award . He also won his third ( and second straight ) Warren Spahn Award . On November 12 , he was awarded his third Cy Young Award in four seasons ( a unanimous vote ) . The following day , he was elected as the NL MVP , the first National League pitcher to win the award since Bob Gibson in 1968 and the first Dodgers player to win the award since Kirk Gibson in 1988 . + + = = = = 2015 season : 300 @-@ strikeout season = = = = + + Kershaw made his fifth straight opening day start in 2015 , the first Dodgers pitcher to do so since Hall of Famer Don Sutton started seven in a row from 1972 through 1978 . He recorded his 1,500th career strikeout on May 10 when he fanned Drew Stubbs of the Colorado Rockies . Kershaw picked up his 100th career win on May 15 against the Rockies . He became the 22nd pitcher in franchise history and the second youngest active pitcher to reach that mark . Kershaw won his sixth career NL player of the week award for the week of June 1 – 7 , 2015 , when he allowed only two runs on 10 baserunners in 15 innings while striking out 18 in two starts that week . Kershaw did not make the initial NL roster for the 2015 All @-@ Star Game , though he was included on the Final Vote ballot , which he lost to Cardinals pitcher Carlos Martinez . However , he was added to the roster to replace Nationals pitcher Max Scherzer , who was unavailable due to pitching the Sunday before the game . 
It became his fifth straight all @-@ star selection , joining Sandy Koufax and Fernando Valenzuela as the only Dodgers pitchers to accomplish that feat . Kershaw struck out a season high 14 batters in eight shutout innings on July 18 against the Washington Nationals . He became the first Dodgers starter with back @-@ to @-@ back games of at least 13 strikeouts since Chan Ho Park in 2000 and the first Dodgers pitcher with back @-@ to @-@ back games of double @-@ digit strikeouts and no walks since Dazzy Vance in 1930 . He shared the NL player of the week honors with his teammate Zack Greinke for the week of July 13 – 19 and won NL pitcher of the month for July . + Kershaw picked up his 200th strikeout of the season on August 12 , tying Hideo Nomo 's 1995 season for the fastest to that mark in Dodgers history at 156 innings . This was the sixth straight 200 strikeout season for Kershaw , tying Sandy Koufax for the most in Dodgers franchise history . On October 4 , Kershaw became the 11th player in Major League history to strike out 300 batters in a season , the first player since Randy Johnson did it in 2002 . He finished the season with a 16 – 7 record , a 2 @.@ 13 ERA , and 301 strikeouts in 232 2 ⁄ 3 innings . + In Game One of the 2015 National League Division Series , Kershaw struck out 11 in 6 2 ⁄ 3 innings but allowed three runs for his fifth straight postseason loss . He and New York Mets starter Jacob deGrom were the first pair of starters to each throw at least 11 strikeouts in the same postseason game in MLB history . He rebounded in game four , earning the win on three days ' rest by allowing one run and three hits against eight strikeouts in seven innings on October 13 . Kershaw finished third in the National League Cy Young Award voting , behind Jake Arrieta and teammate Zack Greinke . In mid @-@ December 2015 , Kershaw participated in an expedition to Cuba composed of MLB officials and players , including former Dodgers manager Joe Torre . 
It was the first visit by MLB since 1999 , and one anticipated as an important step to help normalize relations with the United States that had begun to ease earlier in the year . + + = = = = 2016 = = = = + + Kershaw made his sixth straight opening day start in 2016 as the Dodgers won 15 – 0 . It also marked the first time the Dodgers had won six straight opening day games , all of which he started . On May 12 against the New York Mets , he struck out 13 while pitching a three @-@ hit complete game shutout . He set an MLB record with six consecutive starts with at least 10 strikeouts and no more than one walk and a club record with six consecutive starts with at least 10 strikeouts . He picked up his 100th strikeout on May 29 , while only walking five batters within that period . That was the lowest walk total for a pitcher reaching 100 strikeouts in the modern era , beating Cliff Lee who had seven walks in the 2010 season . On June 30 , 2016 , Kershaw was placed on the 15 @-@ day disabled list due to back pain . He received an MRI , which revealed that there was a mild herniated disc in the back , and received an epidural injection to treat the pain . He was named to the 2016 All @-@ Star team but was unable to pitch in the game due to his injury . On July 20 , the Dodgers shut down Kershaw for an indefinite period of time . He continued to feel discomfort in his back after a simulated game . + + = = Pitching style = = + + Kershaw 's pitching style relies on deception . He keeps the ball hidden so that it is hard for the batter to pick up the ball and has a consistent overhand delivery on all of his pitches . Out of the windup , Kershaw lowers his right foot vertically with a slight pause before moving it forward toward the plate . The motion was described during the 2015 National League Division Series as a " kickstand move " drawing comparison with one setting a kickstand on a bicycle . 
Out of the stretch , he uses a slide step as it makes it difficult for the base runner at first base to get a read on him . He has stated many times that he has modeled his pitching mechanics after his favorite pitcher growing up , Roger Clemens . + Kershaw 's repertoire includes a four @-@ seam fastball that sits anywhere from 92 miles per hour ( 148 km / h ) to 95 miles per hour ( 153 km / h ) and tops out at 98 miles per hour ( 158 km / h ) with late movement , a slider at 84 miles per hour ( 135 km / h ) – 87 miles per hour ( 140 km / h ) , a 12 – 6 curveball between 72 miles per hour ( 116 km / h ) – 76 miles per hour ( 122 km / h ) , and a seldom thrown changeup ( under 3 % ) . As of late in the 2015 season , he is believed to be experimenting with the use of a cutter . He is also known for having one of the better pickoff moves to first base and is considered one of the better fielding pitchers in the game . + + = = = Preparation = = = + + According to many teammates , Kershaw is a noted perfectionist . A.J. Ellis describes his preparation and perfectionism during bullpens before each start : + Three fastballs when I 'm standing up . I sit , and three fastballs down the middle . Then three fastballs either side . Three changeups away . Fastball inside . Three curveballs to the middle . Fastball inside . Three sliders to the middle . Then he goes to the stretch position . Two fastballs inside , two fastballs away , two changeups , one fastball inside , two curveballs , one fastball inside , two sliders . Back to the windup , and one fastball inside , one fastball away . Thirty @-@ four pitches in all . + + = = Awards and accomplishments = = + + + = = = Awards = = = + + + = = = Annual statistical achievements = = = + + Notes : Through 2015 season . Per Baseball @-@ Reference.com. + + = = Personal life = = + + Kershaw grew up in Dallas , Texas and attended school with quarterback Matthew Stafford and fellow pitchers Jordan Walden and Shawn Tolleson . 
One of his favorite players growing up was former Texas Rangers first baseman Will Clark , and the main reason he wears number 22 is to honor Clark . + He is the great @-@ nephew of astronomer Clyde Tombaugh , the discoverer of Pluto . Kershaw 's mother , born Marianne Tombaugh , is the daughter of Clyde Tombaugh 's younger brother . His father , Christopher George Kershaw , was a musician and won a Clio Award for his work . The elder Kershaw remarried after his divorce from Marianne and died in 2013 . + On December 4 , 2010 , Kershaw married his girlfriend of seven years , Ellen Melson . On January 23 , 2015 , Ellen gave birth to the couple 's first child , daughter Cali Ann . He is a Methodist with strong religious faith . + Kershaw made a cameo appearance in " Prince " , a Season 3 episode of New Girl which originally aired following FOX 's telecast of Super Bowl XLVIII . + + = = = Humanitarian work = = = + + Prior to the 2011 season , Kershaw visited Zambia with his wife as part of a Christian mission organized by Dallas @-@ based Arise Africa . After the trip , Kershaw announced his dream of building an orphanage in Lusaka , Zambia , which he called " Hope 's Home " after 11 @-@ year @-@ old Hope , an HIV @-@ positive child Kershaw met while in Zambia . To accomplish his goal , Kershaw pledged a donation of $ 100 per strikeout recorded in 2011 . With Kershaw 's career high of 248 strikeouts thrown during the 2011 season , he donated $ 492 @,@ 300 toward his $ 70 @,@ 000 goal . When Kershaw won the 2011 Players Choice Award , he donated $ 260 @,@ 000 to Hope 's Home . He and his wife returned to Zambia in 2012 . Kershaw donated $ 100 for every strikeout in the 2012 season to Kershaw 's Challenge , calling that season 's incarnation of the project " Strike Out To Serve . " Seventy percent of the money raised in 2012 went to Arise Africa , with 10 percent each going to the Peacock Foundation in Los Angeles , Mercy Street in Dallas , and I Am Second . 
In 2014 , Kershaw continued to support the children of Zambia , with partnership with CURE International , raising funds to pay for 170 children 's surgeries and new medical equipment for CURE hospital in Lusaka . Kershaw has continued his partnership with CURE International in 2015 , setting a goal of funding 100 surgeries for CURE 's hospital in the Dominican Republic . + In addition to Hope 's Home and Kershaw 's Challenge , he has also helped with other programs in Los Angeles , such as helping Habitat for Humanity demolish and rehabilitate a house in Lynwood , California . He is also a supporter of the Peacock Foundation , which provides animal @-@ assisted interventions and activities for at risk youth by partnering with mental health practitioners , public service agencies and community organizations . + + = = = Author = = = + + Kershaw and his wife , Ellen , co @-@ authored a book named Arise : Live Out Your Faith and Dreams on Whatever Field You Find Yourself about their Christian faith and their humanitarian efforts . The book was released on January 10 , 2012 through Regal Press . + + = = = Endorsements = = = + + Kershaw is a celebrity endorser for www.FantasyDraft.com , Wilson Sporting Goods ( glove ) , Under Armour ( shoes ) , Muscle Milk , and Subway . + + + = Josepha Petrick Kemarre = + + Josepha Petrick Kemarre ( born ca . 1945 or ca . 1953 , date uncertain ) is an Anmatyerre @-@ speaking Indigenous Australian from Central Australia . Since first taking up painting around 1990 , her works of contemporary Indigenous Australian art have been acquired by several major collections including Artbank and the National Gallery of Victoria . Her paintings portray bush plum " dreaming " and women ’ s ceremonies ( known as Awelye ) . One of her paintings sold at a charity auction for A $ 22 @,@ 800 . Josepha Petrick 's works are strongly coloured and formalist in composition and regularly appear at commercial art auctions in Australia . 
Her art appears to have survived the huge contraction of the primary art market in Australia since 2008 . There is no existing Catalogue raisonné of Josepha Petrick 's artworks ; to date , no fakes have been cited . + + = = Personal background = = + + Josepha Petrick Kemarre is an Anmatyerre @-@ speaking Indigenous Australian , born around 1945 or 1953 at the Santa Teresa Mission , near Alice Springs in Australia 's Northern Territory . + When Josepha Petrick began painting for Mbantua Gallery in central Australia , she indicated that her name was Josepha rather than Josie , and that this was how she henceforth wished to be known ; however Mbantua 's biography is the only source that has used that version of her name . + After marrying Robin Petyarre , brother of artist Gloria Petyarre , Josepha Petrick moved to the region of Utopia , north @-@ east of Alice Springs , which is where she was living when she began painting around 1990 . They had seven children , one of whom , Damien Petrick , went on to become an artist like his mother . By 2008 , Josie Petrick 's husband had died , and Petrick was dividing her time between Alice Springs and Harts Range , to its north @-@ east . + + = = Professional background = = + + Contemporary Indigenous art of the western desert began in 1971 when Indigenous men at Papunya created murals and canvases using western art materials , assisted by teacher Geoffrey Bardon . Their work , which used acrylic paints to create designs representing body painting and ground sculptures , rapidly spread across Indigenous communities of central Australia , particularly after the introduction of a government @-@ sanctioned art program in central Australia in 1983 . By the 1980s and ' 90s , such work was being exhibited internationally . The first artists , including all of the founders of the Papunya Tula artists ' company , were men , and there was resistance among the Pintupi men of central Australia to women also painting .
However , many of the women wished to participate , and in the 1990s many of them began to paint . In the western desert communities such as Utopia , Kintore , Yuendumu , Balgo , and on the outstations , people were beginning to create art works expressly for exhibition and sale . + + = = = Career = = = + + Josepha Petrick began painting about 1990 or 1992 as part of the contemporary Indigenous art movement that had begun at Papunya in the 1970s . By 1998 her work was being collected by both private and public institutions , such as Charles Sturt University , and in 2005 a work was purchased by the National Gallery of Victoria . Her career received a significant boost when her work was included in the National Gallery of Victoria 's 2006 Landmarks exhibition and its catalogue ; her painting was printed opposite that of Yannima Tommy Watson , who was by this time famous , particularly for his contribution to the design of a new building for the Musée du quai Branly . Petrick 's paintings have been included at exhibitions in several private galleries in Melbourne and Hong Kong , as well as at the Australian embassy in Washington in 2001 . + In 2006 a commissioned work by Petrick was exhibited at Shalom College at the University of New South Wales as part of a charity fundraising exhibition . It sold for A $ 22 @,@ 000 . As of the end of 2008 , the highest recorded auction price for an item of Petrick 's work was $ 22 @,@ 800 , set in May 2007 . An image based on a triptych by Petrick , Bush Berries , appears on the cover of a book on the visual perception of motion , Motion Vision . + Central Australian artists frequently paint particular " dreamings " , or stories , for which they have responsibility or rights . These stories are used to pass " important knowledge , cultural values and belief systems " from generation to generation . Paintings by Petrick portray two different groups of dreamings , rendered in two distinct styles . 
Bush plum dreaming represents a plant of the central Australian desert which is " a source of physical and spiritual sustenance , reminding [ the local Indigenous people ] of the sacredness of [ their ] country " . These paintings are undertaken with red , blue and orange dots that represent the fruit at different stages in its development . She also paints women ’ s ceremonies ( Awelye ) and dreamings , and these are created using rows of coloured dots and include representations of women 's ceremonial iconography . + Journalist Zelda Cawthorne described Petrick as one of the " finest contemporary Aboriginal artists " . Art consultant Adrian Newstead has ranked her as amongst the country 's top 200 Indigenous artists , noting that she has become " known for innovative works that create a sense of visual harmony through fine variegated fields of immaculately applied dotting " . Her style is described by Indigenous art writers Birnberg and Kreczmanski as an " interesting , modern interpretation of landscape " . + Petrick 's work is held in a variety of public and private collections , including Artbank , the Charles Sturt University Collection , the Holmes a Court Collection , and the National Gallery of Victoria . + + + = Head VI = + + Head VI is an oil @-@ on @-@ canvas painting by the Irish @-@ born English figurative artist Francis Bacon , the last of six panels making up his " 1949 Head " series . It shows a bust view of a single figure , modeled on Diego Velázquez 's Portrait of Innocent X. Bacon applies forceful , expressive brush strokes , and places the figure within a glass cage structure , behind curtain @-@ like drapery . This gives the effect of a man trapped and suffocated by his surroundings , screaming into an airless void . 
+ Head VI was the first of Bacon 's paintings to reference Velázquez , whose portrait of Pope Innocent X haunted him throughout his career and inspired his series of " screaming popes " , a loose series of which there are around 45 surviving individual works . Head VI contains many motifs that were to reappear in Bacon 's work . The hanging object , which may be a light switch or curtain tassel , can be found even in his late paintings . The geometric cage is a motif that appears as late as his 1985 – 86 masterpiece , Study for a Self @-@ Portrait — Triptych . + Head VI was first exhibited in November 1949 at the Hanover Gallery in London , in a showing organised by one of the artist 's early champions , Erica Brausen . At the time , Bacon was a highly controversial but respected artist , best known for his 1944 Three Studies for Figures at the Base of a Crucifixion , which made him the enfant terrible of British art . Head VI drew a mixed reaction from art critics ; John Russell , later Bacon 's biographer , at the time dismissed it as a cross between " an alligator shorn of its jaws and an accountant in pince @-@ nez who has come to a bad end " . In 1989 Lawrence Gowing wrote that the " shock of the picture , when it was seen with a whole series of heads ... was indescribable . It was everything unpardonable . The paradoxical appearance at once of pastiche and iconoclasm was indeed one of Bacon 's most original strokes . " Art critic and curator David Sylvester described it as a seminal piece from Bacon 's unusually productive 1949 – 50 period , and one of Bacon 's finest popes . + + = = 1949 Head series = = + + Bacon 's output is characterised by sequences of images . He told Sylvester that his imagination was stimulated by sequences and that " images breed other images in me " . His series were not always planned or painted in sequence ; sometimes paintings are grouped for convenience but vary in execution and tone . 
The idea for the head series came after he returned penniless , late in 1948 , from a stay in Tangier . In the previous three years he had been unable to find a voice ; the last surviving canvas from this period is his Painting ( 1946 ) . Although he continued to paint , he was a ruthless self critic , given to slashing canvases with blades , and no works survive from between 1947 and the winter of 1948 . Gallerist Erica Brausen offered Bacon the opportunity of a solo show for the opening of her new Hanover Gallery . He agreed , but had nothing in reserve to hang . In following years , Brausen became perhaps the most important of Bacon 's early champions ; she arranged this showing — his debut solo exhibition — publicised him widely and organised viewings for international buyers . + Already 40 years old , Bacon viewed the exhibition as his last chance and applied himself to the task with determination . Because he had destroyed all his output of the last three years , he had little choice but to present new works . He did not have a grand plan when he agreed to the show , but eventually found themes that interested him in his Head I of the previous year , and executed five progressively stronger variants in the final weeks before the November exhibition , completing the series barely in time for the opening . + The paintings depict isolated figures enclosed in spaces that are undefined , overwhelmingly claustrophobic , reductive and eerie . Coming early in Bacon 's career , they are uneven in quality , but show a clear progression especially in how they utilise and present ideas he was still clearly developing and coming to terms with . Head I ( actually begun in the winter of 1948 ) and Head II show formless pieces of flesh that broadly resemble human heads ; they have half @-@ open eyes and a pharynx , though it is positioned much higher than would be expected in a human .
Heads III , IV and V show fully formed busts recognisable as men , and are characterised by a haunted atmosphere . These two broad ideas coalesce in Head VI , which is as physiologically tortured as the first two paintings , and as spectral as the middle three . In Head VI the figure has developed and is now shown wearing vestments , the first indication in Bacon 's work of the influence of Velázquez , while the focus has become the open mouth and the study of the human scream . + Bacon said that chance played a significant role in his work , and that he often approached a canvas without having a clear idea of what might emerge . This was especially the case in the mid to late 1940s , a period when he was drinking heavily and spending most nights in Soho casinos and poker rooms . The following morning he would often approach his canvas " in a bad mood of drinking ... under tremendous hangovers and drink ; I sometimes hardly knew what I was doing . " He incorporated his appetite for chance into his work : an image often would morph mid @-@ way through into something quite different from what he had first intended . He actively sought out this freedom and felt it crucial to his progression as an artist . To him , lifestyle and art were intertwined ; he said that " perhaps the drink helped me to be a bit freer . " This is very evident in the 1949 series , which began as a rather morbid study of a collapsed head , but evolved over the six surviving panels into a reworking of Velázquez masterpieces , and arrived at an image that was to preoccupy Bacon for the subsequent 20 years . + The series marks Bacon 's first attempt at depicting lone figures in rooms . For him , the key aspect was that it appeared that the subject felt isolated , unobserved , and had abandoned the need to present an outward face . 
He believed that under these circumstances all pretence falls away , and the social being becomes the sum of its neuroses , which Bacon attempted to convey by reducing the subject to its bare @-@ bones features : a mouth , ears , eyes , a jaw . According to Russell , " the view out front ceases to be the only one , and our person is suddenly adrift , fragmented , and subject to strange mutation . " Russell observed that while the depiction of figures in rooms is common through all eras of painting , the figures are always posed , and usually seemingly aware that they are being portrayed . This conceit is abandoned in Bacon 's series . + Head I , completed late in 1948 , is considered more successful than Head II . Although it is well @-@ regarded critically , Head II is seen as something of a creative cul @-@ de @-@ sac , while Heads III , IV and V are usually considered as merely intermediate steps towards Head VI . It is exceptional in Bacon 's oeuvre that works of their relative poor quality survive ; he was ruthlessly self @-@ critical and often slashed or abandoned canvasses before they were completed . When pressed again by Brausen in 1953 to produce works for a New York show that she had been publicising for a year , he was full of doubt and destroyed most of what he had been working on , including several other popes . + Brausen commissioned another showing to be held in 1950 , for which Bacon painted three large popes modelled on Velázquez 's portrait . The gallery advertised the show as " Francis Bacon : Three Studies from the Painting of Innocent X by Velázquez " , but in the end Bacon was dissatisfied with the works and destroyed them before the show opened . + + = = Description = = + + The figure is clearly identifiable as a pope from his clothing . It seems trapped and isolated within the outlines of an abstract three @-@ dimensional glass cage . 
This framing device , described by Sylvester as a " space @-@ frame " , was to feature heavily throughout the artist 's career . A cord hangs from the upper edge of the glass case , falling just in front of the pope 's face and partially covering his eyes . It is too indistinctly drawn to identify with certainty , but given the presence of similar objects in Bacon 's later works , may be either the end of a hanging light switch or the tassel of a curtain ; the hanging cord was to become a signature for the artist . Apart from its symbolic meaning , it has a compositional function , framing the painting with a further set of vertical lines . Such an object reappears most prominently in the centre panel of his 1973 Triptych , May – June 1973 , where it is clearly a dangling light bulb . For Bacon , these elements were intended to make the figure waver in and out of sight for the viewer , alluding to the fact that bulbs can be on or off , curtains open or closed . + The figure 's mouth is opened wide as if screaming , an expression Bacon took from a still he kept of the nurse screaming in Sergei Eisenstein 's Odessa Steps massacre sequence in his 1925 silent film Battleship Potemkin . In 1984 , the broadcaster Melvyn Bragg asked Bacon about the still , and observed that in his earlier career the artist seemed preoccupied with the physicality of the human mouth . Bacon replied , " I had always thought that I would be able to make the mouth with all the beauty of a Monet landscape though I never succeeded in doing so . " When Bragg asked why he thought he had failed , Bacon said , " It should be all much more colour , should have got more of the interior of the mouth , with all the colours of the interior of the mouth , but I didn 't happen to get it . " His interest in the mouth was further stimulated by a medical textbook of diseased oral cavities bought in a second @-@ hand bookshop , kept in his studio and to which he often referred .
+ The glass cage might imply a vacuum that the figure 's voice is unable to escape ; as if it is screaming in silence . Rueful later in life , Bacon said that he had " wanted to paint the scream more than the horror . I think , if I had really thought about what causes somebody to really scream , it would have made the scream ... more successful " . The work evokes memories of the Second World War . The glass enclosure of his 1949 Chicago Study for a Portrait is often seen as prophesying photographs of Adolf Eichmann 's 1961 trial before a Jerusalem District Court , when he was held within a similar cage . Bacon strongly resisted literal comparisons though , and stated that he used the device so he could frame and " really see the image – for no other reason . I know it 's been interpreted as being many other things . " Other critics saw similarities between the glass case and the radio booths of late 1930s broadcasters who warned against the impending calamity . Denis Farr notes that Bacon was sympathetic to George Orwell and referred in interviews to Orwellian " shouting voices ... and trembling hands ... convey [ ing ] the harsh atmosphere of an interrogation . " + + = = Influences = = + + The so @-@ called " space frame " had already been used by Alberto Giacometti in the 1930s , and the two artists became friends in the 1960s . However Giacometti had by 1949 used it only in surrealist contexts before Bacon 's adaption , and in turn influenced his use in " The Cage " of 1950 . A similar two dimensional construct is found in Henry Moore 's works , notably his " Maquette for King and Queen " , constructed three years after Bacon 's Head . It is difficult to untangle how these artists influenced and informed each other . What is notable is that Bacon continued to use the motif , with intervals until the end of his life . Sylvester suggests his finest example is the 1970 Three Studies of the Male Back .
+ The full @-@ length golden curtain @-@ like folds painted in heavy brush strokes are in part influenced by Degas but also similar to Titian 's 1558 Portrait of Cardinal Filippo Archinto . Bacon adapts the Old Master 's device to isolate and distance the sitter from the viewer ; the black ground @-@ paint is visible through the folds , making the separation all the more affecting . Bacon had already used similar forms in his Chicago panel , and they were to become a feature of his most acclaimed 1950s works , especially in his " screaming popes " . He became fascinated with the veil or curtain as a motif in painting , and collected many reproductions of works by Titian and Degas in which it is employed . He had begun his career as an interior decorator and designer of furniture and rugs in the mid @-@ 1930s , and later said that he liked " rooms hung all round with just curtains hung in even folds " . Veils or curtains appear in Bacon 's earliest works , notably the 1949 Study from the Human Body , always in portraits and always in front of , rather than behind , the figure . + Head VI is closely modelled on Velázquez 's c . 1650 Portrait of Innocent X , today in the Doria Pamphilj Gallery , Rome . Bacon cautiously avoided seeing the original , even when he spent three months in Rome in 1954 . Critics speculate he was afraid of being disappointed , or thought that an intimate knowledge of the painting would dull his imagination . Yet his fascination was all @-@ consuming and he reproduced variants of it obsessively for almost two decades ; an examination and homage described as " without parallel in the history of art " . Bacon 's approach differs to Velázquez 's in a number of ways : both artists were expressive , yet Bacon 's broad brush @-@ strokes and freedom with paint contrast with Velázquez 's tight and controlled treatment . He adapts Velázquez 's positioning of the pope to place him above the viewer 's point of view , elevating and distancing him . 
This was already a common technique in commercial , promotional photography but in Bacon 's hands , Schmied argues , the angle places the pope on a kind of stage for the viewer to coldly observe . + Although Bacon revered Velázquez 's portrait , he did not try and reproduce the earlier painting . In interviews , he said that he saw flaws in Velázquez 's work and that he viewed that social structure and order as , according to art historian Wieland Schmied , " obsolete and decayed " . Bacon 's approach was to elevate his subject so he could knock him down again , thereby making a sly comment on the treatment of royalty in both old master and contemporary painting . Yet Velázquez 's influence is apparent in many aspects of the painting . The sitter 's pose closely echoes the original , as does the violet and white colouring of his cope , which is built up through broad , thick , brush @-@ strokes . The influence can be further seen in the gold @-@ coloured ornaments on the back of the seat that extend on both sides of the figure . Art historian Armin Zweite describes the work as a mixture of reverence and subversion that pays tribute to Velázquez , while at the same time deconstructs his painting . + Sylvester detects the influence of late works by Titian in other aspects , especially in the deep and rich colouring , Velázquez 's portrayals of Philip IV , and agrees with identification of pastels of Edgar Degas as a source . He believes Bacon borrowed from Degas the use of parallel heavy folds to create the illusion of what Degas described as " shuttering " , as seen in the earlier artist 's After the Bath , Woman drying herself . Sylvester makes a further direct link between the folds and the transparent veil in Titian 's Portrait of Cardinal Filippo Archinto . 
He believes the folds serve to " push the viewer back " , creating a distance from the subject , an effect he sees as similar to the separation between an orchestra and setting ; others view the folds as more closely resembling the bars of a prison . Sylvester describes them as an accentuation of background verticals into stripes that are made to appear as if they pass through the sitter . In his " Interviews with Francis Bacon " series of books , he asked Bacon why he found the effect so poignant . The artist replied , " Well , it means that the sensation doesn 't come straight out at you but slides slowly and gently through the gaps . " + When asked why he was compelled to revisit the Velázquez so often , Bacon replied that he had nothing against popes per se , but merely sought " an excuse to use these colours , and you can 't give ordinary clothes that purple colour without getting into a sort of false fauve manner . " Schmied sees Head VI as a reaction against Velázquez , and a commentary on how the papacy is " obsolete and decayed " , with a pope resistant to both modernisation and secularisation . To him , the figure seems to " resist the maltreatment of image and tries to halt the impending collapse of the established work order . He screams and grimaces , clutching at arms of his throne . " Sylvester notes that Bacon was impressed by Picasso 's figuration and handling of paint , especially in Picasso 's 1930s works ; and suggests that the white blobs around the pope 's cape may be influenced by the 1913 Woman in a Slip Seated in an Armchair . + + = = Critical reception = = + + When Bacon undertook the series late in 1948 he was something of a two @-@ hit wonder . He had success in 1944 with Three Studies for Figures at the Base of a Crucifixion and to a lesser extent with Painting ( 1946 ) , both of which were highly regarded but viewed as sensationalist . The exhibition was a success , and marked his critical breakthrough . 
Until then , he had been highly regarded but capable of only occasional brilliance . The full show established him in the minds of critics as , according to Michael Peppiatt , " more of a force to be reckoned with in the contemporary scene " . While some found his images horrifying and unnerving , they wrote about him all the same , sealing his reputation as the enfant terrible of post @-@ war British art . The critic for The Observer wrote , " The recent paintings ... horrifying as they are , cannot be ignored . Technically they are superb , and the masterly handling of large areas of pearly grey , flushed with a sudden pink or green , only makes me regret the more that the artist 's gift should have been brought to subjects so esoteric " . + Most critics focused on Heads I and VI , remarking favourably on the progression between the two . While some found the inherent violence of the paintings distasteful , Brausen was a skilled publicist and turned the bad press into notoriety , and brought Bacon 's work to national attention . Peppiatt notes that the exhibition showed Bacon no longer needed sensationalist material to make an impact , and was now capable of creating an intense emotional response through more subtle means , and had found a way of presenting the human condition in the way he had sought , by presenting his sitter " in a vestigial setting , a cage or [ behind ] a parted curtain ... the rest , the most essential , lay in the manipulation of the infinitely suggestive medium of oil paint " . After the showing Bacon gradually became " less the outsider with an occasional image of horrifying brilliance and more a force to be reckoned with on the contemporary scene " . His reputation and the value of his panels rose dramatically , and after the showing he was sought after by European , American and African collectors and galleries , commanding prices as high as £ 400 for single works , unusual for a contemporary British artist of the time . 
+ + = = Provenance = = + + Head VI was first exhibited at the Hanover Gallery , London , in 1949 . It was acquired by the Arts Council 's Hayward Gallery in 1952 . The Hayward has loaned it out a number of times since , including for major retrospectives at the Grand Palais , Paris in 1971 , and the Hugh Lane Gallery , Dublin , in 2000 . + In May 1996 , the National Gallery took on loan Velázquez 's Innocent X portrait and hung it alongside four Bacon paintings ; Head VI , Pope I ( 1951 ) , Pope 1961 and Pope 1965 . Peppiatt believes that Bacon would have disapproved of such a showing with a work he considered one of the finest ever painted , but writes that two , including Head VI , " stood up to it , and even enhanced its authority as one of the most penetrating studies of human nature and human power " . + + + = Imagism = + + Imagism was a movement in early 20th @-@ century Anglo @-@ American poetry that favored precision of imagery and clear , sharp language . + Imagism has been described as the most influential movement in English poetry since the activity of the Pre @-@ Raphaelites . As a poetic style it gave Modernism its start in the early 20th century , and is considered to be the first organized Modernist literary movement in the English language . Imagism is sometimes viewed as ' a succession of creative moments ' rather than any continuous or sustained period of development . René Taupin remarked that ' It is more accurate to consider Imagism not as a doctrine , nor even as a poetic school , but as the association of a few poets who were for a certain time in agreement on a small number of important principles ' . + The Imagists rejected the sentiment and discursiveness typical of much Romantic and Victorian poetry , in contrast to their contemporaries , the Georgian poets , who were generally content to work within that tradition . 
In contrast , Imagism called for a return to what were seen as more Classical values , such as directness of presentation and economy of language , as well as a willingness to experiment with non @-@ traditional verse forms . Imagists use free verse . + Imagist publications appearing between 1914 and 1917 featured works by many of the most prominent modernist figures , both in poetry and in other fields . The Imagist group was centered in London , with members from Great Britain , Ireland and the United States . Somewhat unusually for the time , a number of women writers were major Imagist figures . + A characteristic feature of Imagism is its attempt to isolate a single image to reveal its essence . This feature mirrors contemporary developments in avant @-@ garde art , especially Cubism . Although Imagism isolates objects through the use of what Ezra Pound called " luminous details " , Pound 's Ideogrammic Method of juxtaposing concrete instances to express an abstraction is similar to Cubism 's manner of synthesizing multiple perspectives into a single image . + + = = Pre @-@ Imagism = = + + Well @-@ known poets of the Edwardian era of the 1890s , such as Alfred Austin , Stephen Phillips , and William Watson , had been working very much in the shadow of Tennyson , producing weak imitations of the poetry of the Victorian era . They continued to work in this vein into the early years of the 20th century . As the new century opened , Austin was still the serving British Poet Laureate , a post which he held up to 1913 . In the century 's first decade , poetry still had a large audience ; volumes of verse published in that time included Thomas Hardy 's The Dynasts , Christina Rossetti 's posthumous Poetical Works , Ernest Dowson 's Poems , George Meredith 's Last Poems , Robert Service 's Ballads of a Cheechako and John Masefield 's Ballads and Poems . 
Future Nobel Prize winner William Butler Yeats was devoting much of his energy to the Abbey Theatre and writing for the stage , producing relatively little lyric poetry during this period . In 1907 , the Nobel Prize for Literature was awarded to Rudyard Kipling . + The origins of Imagism are to be found in two poems , Autumn and A City Sunset by T. E. Hulme . These were published in January 1909 by the Poets ' Club in London in a booklet called For Christmas MDCCCCVIII . Hulme was a student of mathematics and philosophy ; he had been involved in the setting up of the club in 1908 and was its first secretary . Around the end of 1908 , he presented his paper A Lecture on Modern Poetry at one of the club 's meetings . Writing in A. R. Orage 's magazine The New Age , the poet and critic F. S. Flint ( a champion of free verse and modern French poetry ) was highly critical of the club and its publications . From the ensuing debate , Hulme and Flint became close friends . In 1909 , Hulme left the Poets ' Club and started meeting with Flint and other poets in a new group which Hulme referred to as the " Secession Club " ; they met at the Eiffel Tower restaurant in London 's Soho to discuss plans to reform contemporary poetry through free verse and the tanka and haiku and the removal of all unnecessary verbiage from poems . The interest in Japanese verse forms can be placed in a context of the late Victorian and Edwardian revival of interest in Chinoiserie and Japonism as witnessed in the 1890s vogue for William Anderson 's Japanese prints donated to the British Museum , performances of Noh plays in London , and the success of Gilbert and Sullivan 's operetta The Mikado ( 1885 ) . Direct literary models were available from a number of sources , including F. V. 
Dickins 's 1866 Hyak nin is 'shiu , or , Stanzas by a Century of Poets , Being Japanese Lyrical Odes , the first English @-@ language version of the Hyakunin isshu , a 13th @-@ century anthology of 100 waka , the early 20th @-@ century critical writings and poems of Sadakichi Hartmann , and contemporary French @-@ language translations . + The American poet Ezra Pound was introduced to the group in April 1909 and found that their ideas were close to his own . In particular , Pound 's studies of Romance literature had led him to an admiration of the condensed , direct expression that he detected in the writings of Arnaut Daniel , Dante , and Guido Cavalcanti , amongst others . For example , in his 1911 – 12 series of essays I gather the limbs of Osiris , Pound writes of Daniel 's line " pensar de lieis m 'es repaus " ( " it rests me to think of her " ) ( from the canzone En breu brizara 'l temps braus ) : " You cannot get statement simpler than that , or clearer , or less rhetorical " . These criteria of directness , clarity and lack of rhetoric were to be amongst the defining qualities of Imagist poetry . Through his friendship with Laurence Binyon , Pound had already developed an interest in Japanese art by examining Nishiki @-@ e prints at the British Museum , and he quickly became absorbed in the study of related Japanese verse forms . + In an article in La France , 1915 , the French critic Remy de Gourmont described the Imagists as descendants of the French Symbolistes , and in a 1928 letter to the French critic and translator René Taupin , Pound was keen to emphasise another ancestry for Imagism , pointing out that Hulme was indebted to a Symbolist tradition , linking back via William Butler Yeats , Arthur Symons and the Rhymers ' Club generation of British poets to Mallarmé ,
and the Symbolist source was amplified further in Taupin 's study published in 1929 , in which he concluded however great the divergence of technique and language ' between the image of the Imagist and the ' symbol ' of the Symbolists there is a difference only of precision ' . In 1915 , Pound edited the poetry of another 1890s poet , Lionel Johnson for the publisher Elkin Mathews . In his introduction , he wrote + + = = Early publications and statements of intent = = + + In 1911 , Pound introduced two other poets to the Eiffel Tower group : his former fiancée Hilda Doolittle ( who had started signing her work H.D. ) and her future husband Richard Aldington . These two were interested in exploring Greek poetic models , especially Sappho , an interest that Pound shared . The compression of expression that they achieved by following the Greek example complemented the proto @-@ Imagist interest in Japanese poetry , and , in 1912 , during a meeting with them in the British Museum tea room , Pound told H.D. and Aldington that they were Imagistes and even appended the signature H.D. Imagiste to some poems they were discussing . + When Harriet Monroe started her Poetry magazine in 1911 , she had asked Pound to act as foreign editor . In October 1912 , he submitted thereto three poems each by H.D. and Aldington under the Imagiste rubric ( published in the November 1912 second issue thereof ) with a note which described Aldington as ' one of the " Imagistes " ' . This note , along with the appendix note ( ' The Complete Works of T. E. Hulme ' ) in Pound 's book ( also published in Autumn 1912 ) entitled Ripostes are considered to be the first appearances of the word Imagiste ( later anglicised to ' Imagists ' ) in print . + Aldington 's poems , Choricos , To a Greek Marble , and Au Vieux Jardin , were in the November issue of Poetry , and H.D. ' s , Hermes of the Ways , Priapus , and Epigram , appeared in the January 1913 issue ; Imagism as a movement was launched . 
Poetry 's April issue published what came to be seen as " Imagism 's enabling text " , the haiku @-@ like poem of Ezra Pound entitled " In a Station of the Metro " : + The apparition of these faces in the crowd ; + Petals on a wet , black bough . + The March 1913 issue of Poetry contained A Few Don 'ts by an Imagiste and the essay entitled Imagisme both written by Pound , with the latter being attributed to Flint . The latter contained this succinct statement of the group 's position : + Direct treatment of the " thing " , whether subjective or objective . + To use absolutely no word that does not contribute to the presentation . + As regarding rhythm : to compose in sequence of the musical phrase , not in sequence of the metronome . + Pound 's note opened with a definition of an image as " that which presents an intellectual and emotional complex in an instant of time " . Pound goes on to state , " It is better to present one Image in a lifetime than to produce voluminous works " . His list of " don 'ts " reinforced his three statements in " Imagism " , while warning that they should not be considered as dogma but as the " result of long contemplation " . Taken together , these two texts comprised the Imagist programme for a return to what they saw as the best poetic practice of the past . F.S. Flint commented " we have never claimed to have invented the moon . We do not pretend that our ideas are original . " + The 1916 preface to Some Imagist Poets comments " Imagism does not merely mean the presentation of pictures . Imagism refers to the manner of presentation , not to the subject . " + + = = Des Imagistes = = + + Determined to promote the work of the Imagists , and particularly of Aldington and H.D. , Pound decided to publish an anthology under the title Des Imagistes . 
It was first published in Alfred Kreymborg 's little magazine The Glebe and was later published in 1914 by Alfred and Charles Boni in New York and by Harold Monro at the Poetry Bookshop in London . It became one of the most important and influential English @-@ language collections of modernist verse . Included in the thirty @-@ seven poems were ten poems by Aldington , seven by H.D. , and six by Pound . The book also included work by F.S. Flint , Skipwith Cannell , Amy Lowell , William Carlos Williams , James Joyce , Ford Madox Ford , Allen Upward and John Cournos . Max Michelson was also included in the important 1963 anthology by William Pratt , The Imagist Poem : Modern Poetry in Miniature . + Pound 's editorial choices were based on what he saw as the degree of sympathy that these writers displayed with Imagist precepts , rather than active participation in a group as such . Williams , who was based in the United States , had not participated in any of the discussions of the Eiffel Tower group . However , he and Pound had long been corresponding on the question of the renewal of poetry along similar lines . Ford was included at least partly because of his strong influence on Pound , as the younger poet made the transition from his earlier , Pre @-@ Raphaelite @-@ influenced style towards a harder , more modern way of writing . The inclusion of a poem by Joyce , I Hear an Army , which was sent to Pound by W.B. Yeats , took on a wider importance in the history of literary modernism , as the subsequent correspondence between the two led to the serial publication , at Pound 's behest , of A Portrait of the Artist as a Young Man in The Egoist . Joyce 's poem is not written in free verse , but in rhyming quatrains . However , it strongly reflects Pound 's interest in poems written to be sung to music , such as those by the troubadours and Guido Cavalcanti . 
The book met with little popular or critical success , at least partly because it had no introduction or commentary to explain what the poets were attempting to do , and a number of copies were returned to the publisher . + + = = Some Imagist Poets = = + + The following year , Pound and Flint fell out over their different interpretations of the history and goals of the group arising from an article on the history of Imagism written by Flint and published in The Egoist in May 1915 . Flint was at pains to emphasise the contribution of the Eiffel Tower poets , especially Edward Storer . Pound , who believed that the " Hellenic hardness " that he saw as the distinguishing quality of the poems of H.D. and Aldington was likely to be diluted by the " custard " of Storer , was to play no further direct role in the history of the Imagists . He went on to co @-@ found the Vorticists with his friend , the painter and writer Wyndham Lewis . + Around this time , the American Imagist Amy Lowell moved to London , determined to promote her own work and that of the other Imagist poets . Lowell was a wealthy heiress from Boston whose brother Abbott Lawrence Lowell was President of Harvard University from 1909 @-@ 1933 . She loved Keats and cigars . She was also an enthusiastic champion of literary experiment who was willing to use her money to publish the group . Lowell was determined to change the method of selection from Pound 's autocratic editorial attitude to a more democratic manner . This new editorial policy was stated in the Preface to the first anthology to appear under her leadership : " In this new book we have followed a slightly different arrangement to that of our former Anthology . Instead of an arbitrary selection by an editor , each poet has been permitted to represent himself by the work he considers his best , the only stipulation being that it should not yet have appeared in book form . 
" The outcome was a series of Imagist anthologies under the title Some Imagist Poets . The first of these appeared in 1915 , planned and assembled mainly by H.D. and Aldington . Two further issues , both edited by Lowell , were published in 1916 and 1917 . These three volumes featured most of the original poets , ( also including imagist poetry by the American poet John Gould Fletcher ) , with the exception of Pound , who had tried to persuade her to drop the Imagist name from her publications and who sardonically dubbed this phase of Imagism " Amy @-@ gism . " + Lowell persuaded D. H. Lawrence to contribute poems to the 1915 and 1916 volumes , making him the only writer to publish as both a Georgian poet and an Imagist . Marianne Moore also became associated with the group during this period . However , with World War I as a backdrop , the times were not easy for avant @-@ garde literary movements ( Aldington , for example , spent much of the war at the front ) , and the 1917 anthology effectively marked the end of the Imagists as a movement . + + = = Imagists after Imagism = = + + In 1929 , Walter Lowenfels jokingly suggested that Aldington should produce a new Imagist anthology . Aldington , by now a successful novelist , took up the suggestion and enlisted the help of Ford and H.D. The result was the Imagist Anthology 1930 , edited by Aldington and including all the contributors to the four earlier anthologies with the exception of Lowell , who had died , Cannell , who had disappeared , and Pound , who declined . The appearance of this anthology initiated a critical discussion of the place of the Imagists in the history of 20th @-@ century poetry . + Of the poets who were published in the various Imagist anthologies , Joyce , Lawrence and Aldington are now primarily remembered and read as novelists . 
Marianne Moore , who was at most a fringe member of the group , carved out a unique poetic style of her own that retained an Imagist concern with compression of language . William Carlos Williams developed his poetic along distinctly American lines with his variable foot and a diction he claimed was taken " from the mouths of Polish mothers " . Both Pound and H.D. turned to writing long poems , but retained much of the hard edge to their language as an Imagist legacy . Most of the other members of the group are largely forgotten outside the context of the history of Imagism . + + = = Legacy = = + + Despite the movement 's short life , Imagism would deeply influence the course of modernist poetry in English . Richard Aldington , in his 1941 memoir , writes : " I think the poems of Ezra Pound , H.D. , Lawrence , and Ford Madox Ford will continue to be read . And to a considerable extent T. S. Eliot and his followers have carried on their operations from positions won by the Imagists . " + On the other hand , Wallace Stevens found shortcomings in the Imagist approach : " Not all objects are equal . The vice of imagism was that it did not recognize this . " With its demand for hardness , clarity and precision and its insistence on fidelity to appearances coupled with its rejection of irrelevant subjective emotions , Imagism had later effects that are demonstrable in T. S. Eliot 's ' Preludes ' and ' Morning at the Window ' and in D. H. Lawrence 's animal and flower pieces . The rejection of conventional verse forms in the nineteen @-@ twenties owed much to the Imagists ' repudiation of the Georgian Poetry style . + The influence of Imagism can be seen clearly in the work of the Objectivist poets , who came to prominence in the 1930s under the auspices of Pound and Williams . The Objectivists worked mainly in free verse . 
Clearly linking Objectivism 's principles with Imagism 's , Louis Zukofsky insisted , in his introduction to the 1931 Objectivist issue of Poetry , on writing " which is the detail , not mirage , of seeing , of thinking with the things as they exist , and of directing them along a line of melody . " Zukofsky was a major influence on the Language poets , who carried the Imagist focus on formal concerns to a high level of development . Basil Bunting , another Objectivist poet , was a key figure in the early development of the British Poetry Revival , a loose movement that also absorbed the influence of the San Francisco Renaissance poets . + Imagism influenced a number of poetry circles and movements . With the Imagists , free verse became a discipline and acquired status as a legitimate poetic form . This influence was felt especially in the 1950s , with the Beat generation , the Black Mountain poets , and others associated with the San Francisco Renaissance . In his seminal 1950 essay Projective Verse , Charles Olson , the theorist of the Black Mountain group , wrote " ONE PERCEPTION MUST IMMEDIATELY AND DIRECTLY LEAD TO A FURTHER PERCEPTION " ; his credo derived from and supplemented the Imagists . + Among the Beats , Gary Snyder and Allen Ginsberg in particular were influenced by the Imagist emphasis on Chinese and Japanese poetry . William Carlos Williams was another who had a strong effect on the Beat poets , encouraging poets like Lew Welch and writing an introduction for the book publication of Ginsberg 's Howl ( 1955 ) . + + + = Operation Eastern Exit = + + Operation Eastern Exit was the codename given to the military evacuation of the United States embassy in Mogadishu , the capital of Somalia , in January 1991 . In late December 1990 , violence quickly enveloped the city as armed militants began clashing with government soldiers . 
On 1 January 1991 , the US Ambassador to Somalia , James Keough Bishop , contacted the Department of State requesting an evacuation of the embassy , which was approved the following day . United States Central Command began planning and mobilizing forces that evening . The initial plan was to evacuate with a military transport plane through the Mogadishu International Airport , but this was later abandoned . A helicopter evacuation via the USS Guam and USS Trenton was the remaining option . + On the morning of 5 January , a 60 @-@ person Marine and Navy SEAL security detail was dispatched from Guam aboard two CH @-@ 53E Super Stallion helicopters to secure the embassy and prepare for the main evacuation . The two helicopters returned to Guam with the first 61 evacuees . Throughout the day , foreign diplomats and civilians sought refuge at the embassy . Four waves of five CH @-@ 46 Sea Knight helicopters each evacuated the embassy compound shortly after midnight on 6 January . The evacuees were transported to Muscat , Oman , where they disembarked on 11 January . In total , 281 diplomats and civilians from 30 countries were evacuated , including 12 heads of missions ( eight ambassadors and four chargés d 'affaires ) . + + = = Background = = + + In the late 1980s , there was increasing rebellion against the rule of Somali President Siad Barre , a military dictator who maintained tight control of power and had a record of human rights abuses . By 1990 , what began as civil disobedience evolved into a civil war , with several militias organized to overthrow the central government . + In July 1989 , the embassy moved to a new , 80 @-@ acre ( 32 ha ) compound , 6 miles ( 9 @.@ 7 km ) from the previous embassy and James K. Bishop was appointed as the United States ' ambassador to Somalia . Ambassador Bishop had significant experience in crisis management at US embassies . In 1967 , he was at the US Embassy in Beirut , Lebanon when the Six @-@ Day War erupted . 
About 3 @,@ 600 Americans were evacuated in 33 hours ; Bishop was one of 26 diplomats and soldiers who remained in the city . As deputy assistant secretary of state for Africa from 1981 – 87 , Bishop chaired several task forces for crises and gained experience in the State Department 's operations center as evacuations were carried out during several coups d 'etat . During his previous assignment as Ambassador to Liberia ( 1987 – 90 ) , Bishop was overseeing the voluntary evacuation of embassy staff and civilians as a civil war in Liberia spread , when he left in March 1990 . Soon after returning to Washington to prepare for his new appointment to Somalia , he was appointed to a task force to deal with the crisis in Liberia , which included a gradual evacuation of American civilians and a rapid closure of the embassy in August . + On 1 August , before leaving the US to take up his post in Mogadishu , Ambassador Bishop visited United States Central Command — the military command for the Middle East and northeast Africa — where he spent most of the day with its commander , Gen. Norman Schwarzkopf . Ambassador Bishop , aware of the ongoing strife , believed " the odds were better than even that we would have to leave Mogadishu under less than favorable circumstances . " Ambassador Bishop understood from his past experiences in Beirut and Liberia the importance of being prepared to deal with emergencies and spent the afternoon working with military experts to review the embassy 's Emergencies and Evacuation ( E & E ) plan until he was " satisfied ... that [ Central Command ] realized that it might have to conduct an evacuation from Mogadishu and was prepared to do that . " In its analysis of Operation Eastern Exit , the Center for Naval Analyses cited Ambassador Bishop 's previous experience and " clear understanding of his role " in the operation as one of the reasons Operation Eastern Exit went so well . 
+ Hours after Ambassador Bishop 's visit to Central Command , Iraq invaded Kuwait . In 1979 , the US negotiated access to an airport and port in both Mogadishu and Berbera ; because of limited access the US had to locations in the Persian Gulf area , maintaining this access was a main interest for the Mogadishu embassy to pursue as the US mobilized to intervene in Kuwait . + An increasing level of criminal violence prompted Ambassador Bishop to request the voluntary evacuation of dependents ( e.g. children and spouses of staff ) and non @-@ essential staff in early December , although fighting between the government and the United Somali Congress ( a rebel militia ) remained no less than about 100 miles ( 160 km ) away . The voluntary evacuation later became a mandatory evacuation . By 19 December , the number of official US personnel in the city was reduced from 147 to 37 ; around the same time , fighting between the government and rebels came within about 40 miles ( 64 km ) of Mogadishu . + + = = = Collapse of the Barre government = = = + + On 30 December , violence escalated " an order of magnitude " as militants entered Mogadishu , which was quickly enveloped by a general state of lawlessness . On 30 – 31 December , diplomats , including many stationed in offices elsewhere in the city , were collected and housed in the embassy compound , except two volunteers who remained in the embassy 's K @-@ 7 residential apartments located across Afgoy Road from the embassy . The volunteers in the K @-@ 7 building would be needed as look @-@ outs for the embassy compound 's main gate . On the morning of 31 December , the defense attaché was nearly killed when his vehicle was sprayed with bullets and that evening , a soldier at a roadblock shot the tires of a vehicle carrying another defense official . Attempts by the US and other nations ' diplomats , in particular the Italian embassy , to negotiate a ceasefire for foreigners to leave were unsuccessful . 
Afgoy Road became a " shooting gallery , " preventing those in safe @-@ havens outside the embassy from reaching it . On New Year 's Day , the first American civilians began to seek refuge at the embassy . + Ambassador Bishop requested an evacuation of the American community on 1 January , indicating that the evacuation could be with the planned Italian , French , or German evacuation efforts , but preferred an evacuation by the US military . The State Department authorized the evacuation on 2 January and on that day , Ambassador Bishop specifically requested an evacuation by the US military , thereby initiating Operation Eastern Exit . Ambassador Bishop had spent a considerable amount of time discussing contingency plans for evacuation with other diplomatic posts . Ultimately , ten heads of missions — eight ambassadors and two chargés d 'affaires — along with their staff sought refuge in the US embassy compound and were evacuated . + + = = Plans , mobilization , and escalating violence = = + + Ambassador Bishop had visited Central Command in August 1990 , where he worked with military experts to update the embassy 's E & E plan . The first notice that an evacuation of the Mogadishu embassy would be needed came on the morning of 1 January , when the top naval commander at Central Command sent a message to his naval operations staff : " Better have Amphib crowd take a look at a helo NEO of Mogadishu ! time / distance to get there from Masirah OP area . " Following the ambassador 's 2 January evacuation request , the commander of Central Command ordered Air Force aircraft to the region , the movement of amphibious ships to Mogadishu , and requested United States Special Operations Command to prepare for a noncombatant evacuation operation . + The initial plan was to evacuate via Mogadishu International Airport . 
Soon after the evacuation request , the United States Air Force deployed C @-@ 130 transport planes and an AC @-@ 130 , for gunfire support , to Nairobi , Kenya , awaiting clearances to enter Somalia and the ability to safely transfer evacuees from the embassy to the airport . However , the US and other foreign embassies were unable to contact anyone within the government to obtain clearances . It also became apparent that the rebels had an ineffective command @-@ and @-@ control structure , making it impossible to negotiate any ceasefire or guarantee of safe passage . Likewise , government troops faced a command @-@ and @-@ control problem ; reports indicated that army units were separating along clan lines , in some cases soldiers shot officers of a different clan when given orders they disagreed with . Thus , it became clear that safe passage to the airport would not be possible . Several other nations also had aircraft mobilized to reach Mogadishu , but faced the same problems of landing and transit of evacuees to the airport . + On 4 January , several incidents , including a couple exchanges of gunfire , suggested that the embassy 's security detail was insufficient to hold off armed Somalis until the USS Guam and USS Trenton arrived with their helicopters and soldiers , at that time scheduled to arrive on 7 January . The embassy had just six Marine guards , whose job was limited to protecting the chancery . Ambassador Bishop made an urgent request to Washington for two platoons of soldiers to parachute into the embassy to defend it until the ships arrived . The request was denied , but the Ambassador was told that an advance element from the vessels would reach the embassy the following morning . + USS Guam and USS Trenton began transit from the coast of Oman towards Mogadishu at 22 : 30 ( 23 : 30 Oman time ) on 2 January . 
The commander of Amphibious Group Two had initially proposed a seven @-@ ship Amphibious Task Group , composed of vessels anchored at Masirah Island ( off Oman ) and Dubai and including four amphibious ships so that the full range of amphibious capabilities would be available for the operation . However , intervention in Kuwait seemed imminent and the commander of naval forces at Central Command did not want to divert that many ships from the Persian Gulf , thus the decision to send two of the closest ships . Although the two vessels were selected by mid @-@ afternoon on 2 January , the transfer of some personnel from Dubai to Masirah caused a delay of eight to ten hours . Guam and Trenton carried forces from the 4th Marine Expeditionary Brigade , including a detachment of CH @-@ 53E Super Stallion helicopters — the largest helicopters operated by the US military — and two squadrons of CH @-@ 46 Sea Knight helicopters . + Planning began in earnest as the ships got underway , with a combined command center on Guam . On the morning of 3 January , the task force 's command questioned why they were not given the option of an amphibious landing and requested a tank landing ship be added to the task force ; the request was denied . A warrant officer who had previously served as a Marine Security Guard at the Mogadishu embassy during the mid @-@ 1980s was found . Despite Ambassador Bishop 's planning with Central Command , the task force was provided outdated information . The former MSG told planners that a new embassy had been planned and was under construction several years prior . In fact , the new embassy was located further inland and , after receiving updated information , task force commanders determined that a beach landing , requiring troops to fight their way across the city , was too risky . Initial plans had the ships launch their helicopters at 01 : 00 on 7 January . 
However , in response to indications from Ambassador Bishop that conditions in Mogadishu were deteriorating , planners considered 1 @,@ 050 @-@ nautical @-@ mile ( 1 @,@ 940 km ; 1 @,@ 210 mi ) and , later , 890 @-@ nautical @-@ mile ( 1 @,@ 650 km ; 1 @,@ 020 mi ) flights with the CH @-@ 53Es while the ships were still located in the northern Arabian Sea . The situation in Mogadishu stabilized somewhat and the mission was delayed until 5 January . + + = = Evacuation = = + + On the evening of 4 January , the final execute order was issued for a 02 : 45 launch of two CH @-@ 53E Super Stallions to arrive at the embassy at dawn . The 60 soldiers selected for the security detail were issued weapons and ammunition . Two Marine Corps KC @-@ 130 refueling tankers were mobilized closer to the operation , from Bahrain to Oman , to refuel the helicopters en route to Mogadishu and the two helicopters transferred from Trenton to Guam . + + = = = Security detail and first evacuees = = = + + Two CH @-@ 53E Super Stallions carrying a 60 @-@ man security detail — 51 Marines and nine Navy SEALs — departed Guam at 02 : 47 , 466 nautical miles ( 863 km ; 536 mi ) from the embassy , and were expected to arrive at 06 : 20 . They performed two aerial refuelings . During the first refueling , a pipe burst on one of the helicopters , dousing soldiers in fuel and nearly forcing a return to the Guam ; problems with the helicopters ' navigation system also complicated the refueling rendezvous . The helicopters arrived in Mogadishu at dawn , crossing the coast just south of the harbor at 25 – 50 feet ( 7 @.@ 6 – 15 @.@ 2 m ) in altitude on a route that was planned to avoid areas of more intense violence reported in the northern parts of the city . On their arrival in Mogadishu , the crew of the helicopters were using an outdated 1969 map , which showed the embassy in an isolated area . 
Furthermore , they had been told the embassy could be discerned by its white stucco perimeter wall and golf course . The embassy was , in fact , surrounded by new development and the crew saw white stucco walls around many buildings in the city . The helicopters were flying too low to spot a strobe light which was placed on the embassy 's water tower ( the highest point within the embassy compound ) and the golf course in the embassy compound had a black , oil @-@ coated surface — not the familiar green grass that the helicopter crew would recognize . After breaking radio silence ( their only direct communication with the embassy was unencrypted ) to contact the embassy , they were able to discern it and land at 07 : 10 . As they arrived , a group of about 100 to 150 Somalis were attempting to enter the embassy compound via ladders on the wall , but scattered as the helicopters arrived . + The security detail moved to establish a perimeter around the embassy compound and the Air Force 's AC @-@ 130 arrived to provide overhead support . Ambassador Bishop gave the security detail clear instructions on the rules of engagement : they could only use deadly force if people came over the embassy compound 's walls with obvious hostile intent . He also identified three zones of defense , stating a preference to retreat to the third zone before the use of deadly force : + the entire embassy compound + the Chancery , Joint Administrative Office ( JAO ) building , Marine House , and the helicopter landing zone ( HLZ ) + the chancery and JAO buildings ( the two " safehaven " buildings where the evacuees were held ) + Ambassador Bishop clearly explained his rationale to the security detail , which was to avoid any impression that they were intervening in the violence in Mogadishu . He feared that the embassy would be targeted by organized attacks if any group involved in the clashes got the impression that the US was intervening in the conflict .
To this effect , he requested the Voice of America and BBC broadcast announcements that the forces were present only to evacuate the embassy and would not interfere in the conflict . The Marines who had been doused in fuel during the refueling were able to take a shower and wash their clothes . + After an hour on the ground , the helicopters left with the first 61 evacuees , including all American civilians and four heads of mission . Evacuees were provided blankets on one of the flights to remain warm . Complications with the only in @-@ flight refueling on the return nearly prevented refueling , which would have forced the helicopters to divert to the Somali desert and await a rescue . At 9 : 40 , the helicopters arrived on Guam and unloaded the evacuees . + + = = = Embassy during the day = = = + + No threats came upon the embassy during the day , although truckloads of armed Somalis frequently drove by the embassy along Afgoy Road . Only one incident seemed to directly target the embassy . A sniper and a spotter were positioned on the embassy 's water tower ( the highest structure in the compound ) and came under fire ; they were ordered to not return fire and soon thereafter ordered to leave their position on the water tower . + The Office of Military Cooperation , just one and a half blocks from the embassy , required evacuation . Despite its proximity to the embassy , an armed convoy was needed to evacuate persons trapped there by the unrest . A convoy of vehicles with several Marines and SEALs left the embassy at 8 : 47 and returned ten minutes later with 22 persons from the OMC ( four Americans , a Filipino , and 17 Kenyans ) . This was the only excursion outside the embassy by the security detail . Throughout the day , foreign diplomats contacted the embassy desiring to be evacuated ; the US welcomed these requests , but required all of them to find their own transportation to the embassy .
+ A Somali officer who had a previous relationship with the embassy , Major Siad , agreed to travel to rescue the German chargé d 'affaires and British ambassador ( junior staff from the British embassy had previously come to the US embassy ) . The Soviet Union was unable to land a plane in Mogadishu the previous day and the Soviet ambassador asked Ambassador Bishop if he and his staff could be rescued ; Ambassador Bishop , a tennis partner of his Soviet counterpart , agreed but only if they found their own way to the embassy . Seeing the helicopters on the morning of 5 January , they realized the Americans would not remain in the city much longer . At the request of Ambassador Bishop , Major Siad agreed to transport the Soviets , but only if he was paid enough ; the US embassy paid Major Siad , who returned with the Soviet ambassador and 38 of his staff . The brother of President Barre , who was also a Major General and Chief of Police , showed up at the embassy in the afternoon with 25 members of his family requesting to be evacuated , but was turned away after a vocal conversation with the ambassador . + The operation did not include soldiers to handle the evacuation control center ( ECC ) , which was set up in the JAO . A 44 @-@ person force consisting primarily of soldiers to handle the ECC was planned for insertion with the CH @-@ 53E Super Stallions after they had returned to the Guam . However , this was cancelled over objections from the commander of the security detail . The deficit was partially handled by embassy staff who assisted a few soldiers from the security detail . The evacuees were grouped into 15 @-@ person " sticks " to be loaded onto the helicopters and were limited to one piece of luggage apiece . Some attempted to bring more , resulting in problems coordinating their evacuation . Furthermore , many evacuees had pets they wanted to bring , which were not allowed . Most pets were killed by their owners ; some were given poison . 
Meanwhile , the soldiers were allowed to consume anything they wanted from the embassy 's commissary , such as candy , sodas , and souvenirs ( most had been stationed on ships for several months ) . They were also allowed to use or take anything they needed from the embassy ; the medic filled several bags with medical supplies to return to the ship . + As evening approached , work began to prepare the HLZ for the main evacuation . The area was used as a parking lot and several vehicles were left without keys by staff that had already been evacuated . Some cars had to be broken into to be moved . Chemical lights were placed in the HLZ in a NATO " Y " pattern . The entire mission would be conducted with night vision goggles , which required all lights in the embassy compound to be turned off . + + = = = Main evacuation = = = + + The main evacuation occurred in the early morning hours of 6 January and consisted of four waves of five CH @-@ 46 helicopters . The timing of this phase was determined by range of the CH @-@ 46 Sea Knight , which lack aerial refueling capability ; the ships were about 350 – 380 nautical miles ( 650 – 700 km ; 400 – 440 mi ) away during this phase . An AC @-@ 130 was sent from Saudi Arabia to provide gunfire support during the evacuation and two UH @-@ 1 Iroquois helicopters were on standby to provide gunfire support , but were not deployed . + The first wave departed Guam at 23 : 43 . As the second wave landed , Major Siad arrived at the embassy gate accompanied by two truckloads of soldiers and held a grenade in one hand and a radio in the other . His request to speak with the ambassador was granted . Major Siad demanded that the evacuation cease immediately because the Somali government had not granted the US permission to carry out such a military operation . He claimed that he would radio soldiers to shoot down the helicopters if the operation continued .
The second and third waves were able to depart without incident as the ambassador negotiated with the Major , who finally agreed to settle the matter for several thousand dollars in cash and keys to the ambassador 's armored car . Ambassador Bishop remained engaged in conversation with the Major until he reached the helicopter landing zone to depart with the final wave to prevent the Major from reneging on the deal . The final wave departed the embassy at 1 : 49 and landed on Guam at 2 : 23 ; twenty minutes later , Ambassador Bishop declared the evacuation complete . + + = = = Aftermath at the embassy = = = + + Armed looters were observed entering the embassy compound as the final wave departed . The doors of the chancery — the main building of the embassy — were reportedly blown open by RPGs within two hours of the embassy 's evacuation . Somali employees of the embassy — known as foreign service nationals ( FSNs ) — could not be evacuated . Ambassador Bishop tried unsuccessfully to have these employees airlifted to safer parts of Somalia . Many of the FSNs had sought refuge in the embassy with their families and about 30 were hired as guards and protected the embassy throughout the ordeal . Local banks had been closed for some time and the embassy was unable to pay the FSNs . The Ambassador left the FSNs with keys to the commissary and warehouse on the embassy compound and they were permitted to take anything they needed . + + = = = Return to Oman = = = + + A total of 281 evacuees were taken from the embassy , including 12 heads of missions ( eight ambassadors and four chargés d 'affaires ) and 61 Americans ( including Ambassador Bishop and 36 embassy staff ) . The heads of mission were the ambassadors of the United States , Kenya , Nigeria , Soviet Union , Sudan , Turkey , United Arab Emirates , and United Kingdom and the chargés of the embassies of Germany , Kuwait , Oman , and Qatar . 
+ Rather than disembark in nearby Mombasa , as originally thought by the evacuees , the ships were ordered back to Oman — a five @-@ day journey . The sailors and marines made way for the evacuees to share living quarters . When the chaplain of Guam asked crew to sign up as guides for the evacuees while aboard the vessel , two hundred signed up within an hour , and some of the sailors even dressed up as clowns to ease the ordeal for children . At the request of the ambassadors , a formal session with the ships ' senior officers was held to express their thanks . On 11 January , the evacuees were offloaded at Muscat , Oman . That afternoon , the American evacuees were flown to Frankfurt , Germany , from where they continued home . + + + = 2010 Claxton Shield = + + The 2010 Claxton Shield was the 57th Claxton Shield tournament , the premier baseball competition in Australia , and was held from 6 November 2009 to 7 February 2010 . It was hailed as the precursor to the new Australian Baseball League that will start in the place of the Claxton Shield in late 2010 to early 2011 . The Victoria Aces defeated South Australia two games to nil in the championship series to win the tournament ; this was the 22nd time the Claxton Shield had been awarded to a Victorian team . The competition was sponsored by Domino 's Pizza . + At the conclusion of the regular season , the Victoria Aces finished in first place with a 17 – 7 record , earning home @-@ field advantage for the three @-@ game championship series . South Australia hosted the three @-@ game semi @-@ final series against the New South Wales Patriots . Both teams finished with a 14 – 10 record . The Perth Heat ( 12 – 12 ) and Queensland Rams ( 3 – 21 ) both failed to qualify for the finals . 
+ + = = Overview = = + + In June 2009 , it was announced that the rights to the Claxton Shield had been sold to a new Australian Baseball League ( ABL ) , with ownership split between Major League Baseball 's 75 percent share and the 25 percent share owned by the Australian Baseball Federation . The 2010 tournament was considered preparation for the inaugural ABL season starting in 2010 – 11 . It varied from the 2009 Claxton Shield by expanding the season to include ten rounds . Since an uneven number ( five ) of teams were involved , four teams paired off for each round and played a three @-@ game series , while the remaining team took a bye . During the season , each team had two bye rounds and played two rounds against each other team , one at home and one away . In total , the schedule allowed for 24 regular @-@ season games per team before a postseason similar to the 2009 edition : the first @-@ place team directly qualified for the championship series and played against the winner of a playoff series between the second- and third @-@ place teams . + During the regular season , games were played on a Friday night and a doubleheader on Saturday ; in each doubleheader one of the two games was shortened to seven innings . The exception to this was when Perth played their home games ; they played on a Thursday night instead of a doubleheader on Saturday . Each postseason series was scheduled for a Friday , Saturday and Sunday . + + = = Teams = = + + + = = = Rosters = = = + + The 2010 series allowed each team to make use of a 19 @-@ man active roster . Exceptions were made in two cases that allowed teams ' active rosters to expand to 21 players , both times for the same reason . Two games during the season had to be postponed because of poor weather . Both games involved teams meeting for the first time during the season ; make @-@ up games were scheduled at the start of the return series between the teams , and this resulted in two four @-@ game series .
In both cases , the teams had a 19 @-@ man roster for the make @-@ up game , and an expanded 21 @-@ man roster for the originally scheduled series . + + = = = Venues = = = + + The 2010 Claxton Shield was contested between five teams from around Australia . In previous years , many of the teams had played their home games at multiple venues . This season each team held their home games at only one venue . There was one scheduled exception to this at the start of the season : the New South Wales Patriots ' final home series against the Perth Heat was held at Gilchrist Oval , whereas all of their other home games were held at Blacktown Baseball Stadium . + As a result of poor attendance at Geelong Baseball Park , game one of the fifth @-@ round series between New South Wales and the Victoria Aces was moved to La Trobe University , Melbourne . Although the Geelong games had attracted crowds of no more than 500 , the moved game had an attendance of 2 @,@ 200 . Though no further regular season games were moved , the finals series hosted by the Aces was held at La Trobe University as well . + The venues are as follows : + + = = Regular season = = + + † — A game postponed from Round 7 , held in Round 8 , was played with Victoria Aces as the away team and Queensland Rams as the home team , despite being played at Geelong Baseball Park , Geelong , Victoria . + ‡ — A game postponed from Round 3 , held in Round 9 , was played with South Australia as the away team and Victoria Aces as the home team , despite being played at Norwood Oval , Adelaide , South Australia . + The Queensland Rams were the first team to be eliminated from contention for the finals , after being swept four games to nil by the Victoria Aces in round 8 . The following round saw South Australia clinch a position in the finals , despite finishing the round in second position . 
It was not until the final round that the last two spots in the finals were decided : the Aces clinched top spot by sweeping the Perth Heat , which combined with the New South Wales Patriots sweep of the Rams eliminated Perth from contention and secured the last finals spot for the Patriots . + + = = = Statistical leaders = = = + + + = = Finals series = = + + The 2010 Claxton Shield made use of the same finals structure as had been used in the 2009 season . The top three teams at the conclusion of the ten rounds of regular @-@ season games qualified . The second- and third @-@ place teams faced each other in a best @-@ of @-@ three series hosted by the second @-@ place team . The winner of that series then faced the first @-@ place team for a best @-@ of @-@ three series . South Australia hosted the New South Wales Patriots at Norwood Oval , Adelaide , while the Victoria Aces hosted the championship series at La Trobe University , Melbourne . In the finals , the home team and away team alternated during each of the series . As a result , South Australia was officially the away team for game two of its series against New South Wales , as was Victoria in the championship series . + After defeating the Patriots two games to one in the semi @-@ final series , South Australia progressed to the championship series against the Aces . There they were defeated two games to nil . After game two of the championship series , Victoria 's Matthew Blackmore was named both Claxton Shield Final Series MVP and Pitcher of the Year . + + = = = Semi @-@ final series = = = + + + = = = Championship series = = = + + + = = Awards = = + + At the conclusion of the finals series , the winners of two awards were announced . Matthew Blackmore won both the Pitcher of the Year award and the Finals Series MVP award .
At the Baseball Australia Diamond Awards , held on 6 March at the Hotel Grand Chancellor , Adelaide , Wayne Lundgren was announced as the 35th winner of the Helms Award ; the Claxton Shield 's Most Valuable Player award . Lundgren was the first pitcher to win since 1986 . Runners @-@ up by two votes were Paul Mildren and Michael Collins . + + + = Independiente ( Ricardo Arjona album ) = + + Independiente is the thirteenth Spanish @-@ language studio album by Guatemalan singer @-@ songwriter Ricardo Arjona , released on 23 September 2011 . Recorded in the United States and Mexico , it was produced by Arjona with Dan Warner , Lee Levin and Puerto Rican singer @-@ songwriter Tommy Torres . The album — the first independent release by Arjona after he was signed by Sony Music in 1993 and Warner Music in 2008 — was issued by his own label , Metamorfosis . + Composed and written in a year , the record marks Arjona and Torres ' fourth collaboration . For Independiente , Arjona returns to his trademark sound after his stylistic departure for Poquita Ropa ( 2010 ) . While producing the latter , he had used fewer instruments to simplify his sound , having introduced what had been called a " stripped @-@ down acoustic effort " in his music . Independiente has been compared to his earlier recordings , Historias ( 1994 ) and Animal Nocturno ( 1993 ) . + Independiente became Arjona 's fourth number @-@ one album on the Billboard Top Latin Albums where it debuted for the week ending 22 October 2011 . For thirteen non @-@ consecutive weeks it topped the Latin Pop Albums chart , and reached number one on the Mexican Albums Chart . It is his fifth consecutive album to chart on the Billboard 200 ( reaching number sixty @-@ five ) , and his fourth album to chart in Spain ( peaking at number sixty @-@ eight ) . Within one week after its release Independiente was certified gold in Chile , the United States and Mexico and certified platinum in Venezuela and Argentina . 
+ Five singles have been released from the album . The lead single , " El Amor " , became a commercial success in several Latin American countries and was number one on the Billboard Latin Songs and Latin Pop Songs charts . It was followed by " Fuiste Tú " ( featuring Gaby Moreno ) , which reached number one on the Latin Pop Songs , number two on the Latin Songs charts and topped several other national charts . " Mi Novia Se Me Está Poniendo Vieja " was released in May 2012 ; " Te Quiero " in July 2012 , and " Si Tu No Existieras " in November 2012 . To promote Independiente , Arjona embarked on his Metamorfosis World Tour . + + = = Background = = + + In 2010 , Arjona wanted to change his musical style ; after experimenting with using as few instruments as possible , he obtained a sound similar to an a capella performance ( simplifying his sound ) and introduced what he called a " stripped @-@ down acoustic effort " to his music . This was heard on his twelfth studio album , Poquita Ropa . Arjona produced the album with Dan Warner , who has worked with Shakira , Celine Dion and Christina Aguilera . When promoting the album Arjona said , " [ songs ] are like women ; they get things up and are so concerned about this that they forget that the less clothes , more beauty . The songs are often overwhelmed by ourselves , because we saturate them with arrangements looking to exalt their qualities and we end up hiding them " . Poquita Ropa became the first album since Adentro which Arjona recorded without Torres . + Weeks before the release of Independiente , Arjona issued a letter raising the issue of his past relationships with recording companies . He revealed the circumstances of his first contract : " a producer , friend of mine , told them [ the record label ] that if they did not sign me in , they won 't sign two artists he had [ at that time ] " . Arjona further explained that he received the " minimum royalty percentage " from his most successful albums . 
Independiente is Arjona 's first independent release through his own label : Metamorfosis , a company he created to refocus his career . The company is presided over by Arjona and several friends ( including photographer @-@ director Ricardo Calderón , Universal Music México executive Humberto Calderon and BMG 's Miriam Sommerz ) , and is based in Miami and Mexico City . Arjona commented that his independence represented compromise more than freedom , stating that " Inside the word ' Independent ' , even when it sounds like extreme freedom , there 's a big amount of compromise and the responsibility of being able to administrate , in the best way possible , such independence " . Billboard notes that , although other groups have released independent albums following contracts with major labels , Arjona is the most important Latin pop artist to do so . Although the album is marketed within the new label , distribution was handled by Warner Music . + + = = Production and recording = = + + Independiente marked Arjona 's fourth collaboration with Torres . The latter was a composer and producer , also receiving background @-@ vocal credit . The musicians first worked together in 2005 , when Arjona released his tenth studio album ( Adentro ) . He stated that he first " tested " Torres by sending him the " hookiest and darkest tracks " on the album : " Acompañame A Estar Solo " and " Iluso " . Torres then " went all out on the first demo , hiring a full band that included a string orchestra " . In Quién Dijo Ayer ( 2007 ) , Torres produced the singles " Quién " and " Quiero " and provided background vocals on the remastered versions of Arjona 's past hits . In 5to Piso ( 2008 ) , Torres produced several tracks ; one was the lead single " Como Duele " , considered Arjona 's " biggest hit in years " by Jason Birchmeier of Allmusic . + The album was composed over a one @-@ year period .
Most of its production was handled by three producers familiar with Arjona 's work : Dan Warner , Lee Levin and Dan Rudin . Tommy Torres also produced three tracks : the lead single " El Amor " , second single " Fuiste Tú " and " Hay Amores " . Victor Patrón produced two songs , ( " Caudillo " and the piano version of " Mi Novia Se Me Está Poniendo Vieja " ) and Julio Chávez aided in the production of " Reconciliación " . Arjona wrote all the songs except " El Amor " ( which was co @-@ written with Torres ) . The album was recorded and produced in several cities in the United States and Mexico . Independiente was mixed at the Blue Grotto in Nashville , Tennessee , and mastered by Tom Coyne and Aya Merrill at Sterling Sound in New York City . With Torres ' return to producing Arjona regained the classic , trademark sound which Torres helped develop since 2005 . + + = = Composition = = + + Independiente opens with " Lo Que Está Bien Está Mal " , a Latin pop song and the only track composed by Dan Warner instead of Arjona ( who wrote the lyrics ) . " El Amor " was motivated by Arjona 's desire to examine " those big , dark events within love that nobody talks about " ; he continued , " [ the ] dark sides of love are extremely fundamental to understand its great value . " Arjona added , " So many good things about love has been shown that somebody had to turn it around and tell the bad ones " . In a February 2012 interview , Arjona stated that " El Amor " was the " most tawdry " song he had released to date , explaining that their choice of the song was a " contradiction " because it was not " the song which could better represent the entire album " . He described it as " very strong " and " a bit dark " . The single marked Arjona 's return to his signature , mainstream sound after the Cuban music influenced Poquita Ropa 's lead single " Puente " , a mixture of salsa and merengue which failed to make an impact in the United States .
+ The album includes " Fuiste Tú " , a duet with Guatemalan singer Gaby Moreno . Its instrumentation consists of piano , violin , guitars , drums and other percussion . Although Arjona stated that he " had the possibilities to record this song with very well known people " he expressed his happiness with Moreno , revealing that " the possibilities of doing it with her , for me , are a celebration " . He described Moreno as " incredibly talented " , a " countrywoman " and a " fantastic human being " . Arjona named " Fuiste Tú " as one of the most important songs on the album . " Mi Novia Se Me Está Poniendo Vieja " took two years to complete and Arjona dedicated it to his mother , Noemí Morales . He stated that he wrote it " as a gift for my mom on Mother 's Day " and that he thought " the idea of including it on the album was very good " . As with his single " Señora De Las Cuatro Decadas " ( on 1994 's Historias ) , at first he never thought to include the song on an album . " Caudillo " evokes " the image of some friends " Arjona had at college ; he asserted that he " appears constantly there because sometimes we transform ourselves into a contradiction of all those things we fought in those moments . It 's the history of a student leader that becomes a president " . Arjona dedicated the album to his father , who died in 2011 . + + = = Release and promotion = = + + Independiente was first digitally released in some South American countries on 23 September 2011 as a special edition , dubbed the Cono Sur Edition . This version included a different mix of " Reconciliación " . On 30 September , the digital download for the standard edition of the album was released in several Latin American and European countries . On 4 October , the album was officially released as a digital download and compact disc in most of these same markets as well as North America ; an iTunes edition was released as a digital download on the iTunes music store . 
This version included an album @-@ only video , entitled " Independiente " . In Germany , the album was first available on the Kiwi label on 4 October and on 11 October through Warner Music . In Canada and Spain , the compact @-@ disc version of the album was available on 25 October . + Arjona appeared on a television special in 2011 to promote Independiente . The special featured guest appearances by Gaby Moreno , Ricky Muñoz ( of the Mexican band Intocable ) and Paquita la del Barrio . Broadcast by Televisa , the program showcased the fourteen songs on Independiente . Muñoz said that he was " happy to do things for Ricardo [ Arjona ] " , elaborating that they met each other " some time ago " and it was " a very special situation " . The show was later rebroadcast on 5 November by Canal de las Estrellas . + + = = Singles = = + + The first single from Independiente is " El Amor " , released on 23 August 2011 . In the United States it reached number one on the Billboard Top Latin Songs chart ( Arjona 's fourth number one on that chart , following " Desnuda " , " Cuando " and " El Problema " ) and number one on the Billboard Latin Pop Songs chart . It was also a hit in Latin America , reaching number one in Argentina , Mexico , Colombia , Venezuela , Chile , Costa Rica , Panama and Guatemala . The music video for " El Amor " , filmed in black @-@ and @-@ white , was released on 8 September 2011 . It was directed by Ricardo Calderón ( who also directed Arjona 's music video for " Como Duele " ) and filmed in Mexico City . The second single from the album is " Fuiste Tú " , a duet with Guatemalan singer Gaby Moreno . The music video for the song was shot in Guatemala ( around the tropical areas of Antigua Guatemala , Río Dulce , the Atitlán lake , Semuc Champey and the Tikal ruins ) and directed by Argentine director Joaquín Cambre . Arjona commented that " this video recreates the battle on the couple when someone starts to talk ' is the beginning of the end ' " . 
" Fuiste Tú " reached number two on the Billboard Top Latin Songs and number one on the Latin Pop Songs charts . + " Mi Novia Se Me Está Poniendo Vieja " was released as the third single . Arjona wrote the song for his mother , Noemí Morales . The music video for the song , released in April 2012 , was filmed at Universal Studios in Los Angeles . It features Arjona and his son , Ricardo Arjona Jr . , and was directed by Robert García . The song was used by American telecommunications corporation AT & T for a Nokia Lumia 900 smartphone commercial featuring Arjona and was released in music stores in May 2012 . The fourth single from the album , " Te Quiero " , was released in July 2012 . The music video for the song was filmed during Arjona 's concerts at Vélez Stadium in Buenos Aires , Argentina during his Metamorfosis World Tour . It marks Arjona 's first music video taken from a live performance . The song reached number ten in Mexico and number one on both the Billboard Latin Songs and Latin Pop Songs chart . " Si Tu No Existieras " was released in November 2012 as the set 's fifth and final single , and was intended to promote the re @-@ release of the album . The song , whose music video was similar to that of " Te Quiero " , managed to peak at number 14 in Mexico . + + = = Tour = = + + Beginning on 27 January 2012 in Toluca , Mexico , Arjona embarked on a world tour to promote the album . The Metamorfosis World Tour was announced in December 2011 , and visited the Americas . The show consisted of four theatrical sets on a revolving stage ; Arjona performed on each in turn , as it related to each song . Fellow Guatemalan singer @-@ songwriter Gaby Moreno appeared in several performances , joining Arjona for " Fuiste Tú " . The tour was praised by critics and fans . Natalie Torres of Dia a Dia reported , " Arjona knows how to handle his ' girls ' , with a mix of attitudes from a ' rough ' male and seductive lyrics " . 
+ Jon Pareles of The New York Times commented that " Arjona is one of Latin pop ’ s finest lyricists : observant , nuanced , sometimes wry , sometimes melancholy and especially fond of the play of opposites " . He added , " unlike some of his fellow Latin pop stars , Mr. Arjona is no saccharine lover boy " . The tour broke records for ticket sales , commercial gross and attendance . In Buenos Aires it was the most popular show at Velez Stadium , with a total attendance of more than 160 @,@ 000 for four consecutive sold @-@ out concerts . In Guatemala City Arjona was the first artist with two consecutive sold @-@ out concerts at Mateo Flores Stadium , with a combined attendance of more than 50 @,@ 000 . As of October 2012 , the tour has been performed for close to one million people in more than eight countries . + + = = Commercial performance = = + + Independiente debuted at the top of the Billboard Top Latin Albums for the week ending 22 October 2011 , and remained at that position the following week . It was the third album by Arjona to remain for more than a week at number one , after Galería Caribe ( 2000 ) and 5to Piso . Independiente became his fourth chart @-@ topper , following Poquita Ropa ( 2010 ) . For its third week it fell to number two , replaced by Chino & Nacho 's Supremo . The album also debuted at number one on the Latin Pop Albums chart for the week ending 22 October , becoming Arjona 's fifth album to do so . It remained at number one the following week ; for its third week , it was replaced by Supremo . The album reached number one again for the week ending 12 November , and later for the week ending 11 February 2012 . For its second run it remained three weeks at the top before being replaced by Maná 's Drama y Luz for a week ; for its third run at number one , it remained at the top spot for five weeks . For the week ending 2 June 2012 , Independiente returned again to number one . 
+ For the week it debuted atop both the Latin Albums and Latin Pop Albums charts , Independiente also appeared as number 65 on the Billboard 200 . It is Arjona 's fifth consecutive album to chart on that list ( following Adentro , Quién Dijo Ayer , 5to Piso and Poquita Ropa ) , although it has only charted higher than Adentro . In Mexico , Independiente debuted at number one for the week ending 9 October 2011 . The following week it fell to number two , replaced at the top by Espinoza Paz ' Canciones Que Duelen . For its third week , the album fell to number three . In Argentina , Independiente debuted at number one for the week ending 9 October 2011 ; it remained at the top position for a single week , dropping to number five the following week . The album also charted on Spain , reaching number 76 . The following week it fell off the chart but later re @-@ entered , reaching its peak at number 68 . Independiente is Arjona 's fourth album to chart in Spain , following Adentro , 5to Piso and Poquita Ropa . On the 2011 year @-@ end charts , Independiente was the 50th best @-@ selling album on the Latin Albums chart and the 15th best @-@ seller on the Latin Pop Albums chart . In Mexico , it was the 19th best @-@ selling album of 2011 . + Independiente was certified platinum by the Argentine Chamber of Phonograms and Videograms Producers in recognition of 40 @,@ 000 copies sold . It was also certified gold and platinum by the Mexican Association of Producers of Phonograms and Videograms for 90 @,@ 000 copies shipped . In the United States , Independiente was certified Latin platinum by the Recording Industry Association of America for 100 @,@ 000 copies shipped . In Venezuela , the album was certified double platinum for more than 40 @,@ 000 copies sold . It was certified gold in Chile for 5 @,@ 000 copies shipped , and in Colombia for 10 @,@ 000 copies sold . As of November 2012 , Independiente has sold 75 @,@ 000 copies in the United States . 
+ + = = Critical reaction and awards = = + + David Jeffries of Allmusic gave the album a mildly positive review , citing Arjona 's return to his more mainstream style after the " stripped @-@ down acoustic effort " Poquita Ropa . He compared ( as did Arjona and other critics ) the production values and musical style of Independiente with past albums Animal Nocturno and Historias . Finally , he stated that " Returning fans will revel in this combination of freedom and growth , and appreciate the return of producer Tommy Torres , the man who has been behind the boards for quite a few of Arjona 's most popular releases " ( referring to Torres ' absence from the production of Poquita Ropa ) . + A contributor to the Colombian website CMI commented that " listening to Independiente is a labyrinth to go through , each song is a huge path that seems to have no end , because it involves imagination , it invites you to dream , to charm , to bewitch . But neither leaves behind the problematic requirements of love , its loopholes , hideouts and concerns , as well as its bad times in this joke that 's life " . Independiente was nominated at the Premios Juventud of 2012 for the " Lo Toco Todo ( I Play It All ) " award . On 25 September 2012 , the album received two nominations at the 13th Annual Latin Grammy Awards : Album of the Year and Best Singer @-@ songwriter album . On 3 December 2012 , Independiente received a nomination for " Pop Album of the Year " at the 2013 Premios Lo Nuestro awards . It also received a nomination for Grammy Award for Best Latin Pop Album at the Grammy Awards of 2013 . Arjona has not won the latter award since 2005 , when he won with Adentro in a shared win with Mexican singer Julieta Venegas . In February 2013 , Independiente received a nomination for " Latin Pop Album of the Year " at the Billboard Latin Music Awards of 2013 . + + = = Track listing = = + + All songs written and composed by Ricardo Arjona , except where noted . 
+ + = = Personnel = = + + Credits are taken from Independiente liner notes . + + = = Chart performance = = + + + = = Certifications = = + + + = = Release history = = + + + + = 2003 Pacific typhoon season = + + The 2003 Pacific typhoon season was a slightly below average yearlong period of tropical cyclogenesis exhibiting the development of 31 tropical depressions , of which 21 became named storms ; of those , 14 became typhoons . Though every month with the exception of February and March featured tropical activity , most storms developed from May through October . During the season , tropical cyclones affected the Philippines , Japan , China , the Korean Peninsula , Indochina , and various islands in the western Pacific . + The season ran year @-@ round , with the first storm , Yanyan , developing west of the Marshall Islands on January 15 . In April , Typhoon Kujira became one of the longest @-@ lasting Pacific typhoons in history and attained climatological records for its unusually early impacts . Typhoon Imbudo in July caused several deaths and extensive damage across the Philippines and China . In September , Typhoon Maemi became one of the costliest typhoons in recorded history after striking South Korea ; Maemi was also the most intense tropical cyclone of the season with a minimum barometric pressure of 910 mbar ( hPa ; 26 @.@ 87 inHg ) . In late November , Typhoon Lupit devastated areas of Yap State in the Federated States of Micronesia . The season closed with the dissipation of a tropical depression east of the Philippines on December 27 . + The scope of this article is limited to the Pacific Ocean , north of the equator and west of the International Date Line . Storms that form east of the date line and north of the equator are called hurricanes ; see 2003 Pacific hurricane season . Tropical Storms formed in the entire west Pacific basin are assigned a name by the Tokyo Typhoon Center . 
Tropical depressions in this basin monitored by the Joint Typhoon Warning Center ( JTWC ) have the " W " suffix added to their number . Tropical depressions that enter or form in the Philippine area of responsibility are assigned a name by the Philippine Atmospheric , Geophysical and Astronomical Services Administration or PAGASA . This can often result in the same storm having two names . + + = = Seasonal forecasts = = + + On March 5 , 2003 , meteorologists from the University College London at the Tropical Storm Risk ( TSR ) Consortium issued an extended range forecast for the typhoon season , noting the likelihood of near average tropical cyclone activity as a result of projected neutral sea surface temperatures . The forecast indicated the potential for 26 @.@ 2 tropical storms , compared to the 10 – and 30 @-@ year average of 27 @.@ 8 and 26 @.@ 3 storms , respectively . The following month , the group raised their forecast for tropical storms to 26 @.@ 7 , indicating a slightly above average season . Over next two months , however , fluctuations in sea surface temperatures , particularly those in the Central Pacific , caused the group to revise their predictions downward and indicated the probability for a slightly below average typhoon season in their June forecast . A rise in sea surface temperatures in the following months prompted the forecasting group to once again raise their forecasts to indicate a near @-@ average season in their final August forecast update , which predicted 27 tropical storms . The group was very accurate in their forecasts , with their April and August forecasts being the most accurate . + Similarly , meteorologists working with the City University of Hong Kong issued a seasonal projection on April 24 , 2003 , indicating the likelihood of a normal or below normal season with 29 total tropical cyclones , 26 tropical storms , and 16 typhoons . 
As with the TSR , the group primarily based their forecast numbers on the prevailing status of the El Niño @-@ Southern Oscillation . The City University of Hong Kong revised their forecasts on June 24 , 2003 , indicating a slight increase of total tropical cyclones to 30 . The group was also accurate in their forecasts for the entirety of the Northwest Pacific , though their specialized forecasts for the South China Sea were substantially off . + During the year , the Japan Meteorological Agency ( JMA ) issued advisories on tropical cyclones west of the International Date Line to the Malay Peninsula , and north of the equator ; this was due to the agency 's status as the official Regional Specialized Meteorological Center , as designated by the World Meteorological Organization in 1989 . The JMA issued forecasts and analyses four times a day , beginning at 0000 UTC and continuing every six hours . The JMA issued forecasts based on a climatological tropical cyclone forecast model . The agency estimated 10 minute sustained winds and barometric pressure based on the Dvorak technique and numerical weather prediction . The JTWC also issued warnings on storms within the basin , operating from Pearl Harbor in Hawaii to represent the interests of the United States Armed Forces in the Indian and Pacific Oceans . + + = = Season summary = = + + Throughout the season , sea surface temperatures within the western equatorial Pacific were above normal , including those in the South China Sea . Areas of convection persisted year @-@ round in the lower latitudes , particularly around the Philippines . Atmospheric divergence was also prevalent in the same regions , resulting in enhanced tropical cyclogenesis east of the Philippines in 2003 ; the mean region of development of tropical systems during the year was more southwest than the 1971 – 2000 30 @-@ year average . 
In 2003 , the JMA monitored 21 tropical cyclones of at least tropical storm intensity ; of those , 14 reached typhoon intensity . Though the number of tropical storms was below average , the ratio between tropical storms and typhoons was 66 % greater than normal . The Philippine Atmospheric , Geophysical and Astronomical Services Administration ( PAGASA ) monitored three additional cyclones of at least tropical storm intensity that were not monitored by the JMA . + The season began with the formation of Tropical Storm Yanyan on January 15 . After its dissipation five days later , no tropical cyclones of at least tropical storm intensity developed over the next two months . This period of inactivity ended with the formation of Typhoon Kujira in mid @-@ April ; Kujira was one of the longest lived Pacific storms on record and was the first typhoon with 1 @-@ minute sustained winds of at least 240 km / h ( 150 mph ) in April since Typhoon Isa in 1997 . Tropical activity was enhanced from May to June , and during this period the JMA monitored four tropical storms , while the PAGASA monitored a fifth storm off the eastern Philippines . Three of the four tropical storms monitored by the JMA approached or hit Japan , including Typhoon Soudelor , which brought heavy rainfall and wind across the Ryukyu Islands and the Korean Peninsula . + Tropical activity once again declined towards the second half of June and first half of July . The second half of July , however , featured the development of typhoons Imbudo and Koni , which both tracked westward across the Philippines before striking areas near Saipan and other regions of southeastern China . Imbudo caused the deaths of 78 people and US $ 383 million in damage . August was a highly active month for tropical cyclogenesis , with a total of six tropical storms monitored by the JMA , JTWC , and PAGASA . This included typhoons Krovanh and Dujuan , which also struck southeastern China . 
Typhoon Etau earlier in the month made landfall in Japan , resulting in 17 deaths . + Activity was somewhat below average in September , with only one tropical cyclone making landfall , Maemi . However , Maemi was the strongest tropical cyclone of the season and was the costliest with roughly US $ 4 @.@ 8 billion in damage , mostly in South Korea . Tropical cyclogenesis and activity continued to decline after August , with October featuring only three tropical storms . However , two , Ketsana and Parma , reached typhoon intensity ; both stayed away from land . November featured fewer storms but was climatologically average , with two typhoons developing . The second typhoon , Lupit , devastated portions of Yap State , resulting in approximately $ 1 @.@ 7 million in damage . In December , the JTWC and PAGASA monitored a sole tropical system east of the Philippines , though the JMA did not monitor or classify any tropical cyclones during the month . + + = = Storms = = + + In storm information below , wind @-@ speed advisories differ from the Joint Typhoon Warning Center ( JTWC ) to the JMA as the JTWC uses the United States criteria of 1 @-@ minute mean to designate maximum sustained winds , while the JMA uses the 10 @-@ minute mean wind criteria to designate tropical cyclone maximum sustained winds . This difference generally results in JTWC maximum winds appearing higher than the maximum winds described by the JMA for the same cyclone . + + = = = Tropical Storm Yanyan = = = + + On January 11 , the JTWC began monitoring the disturbance that would eventually develop into Yanyan near the International Date Line . As the system tracked westward , it gradually moved into a more favorable environment for tropical cyclogenesis . On January 14 , surface observations indicated that the low @-@ pressure area had developed a closed , low @-@ level circulation center indicative of a tropical cyclone , though satellite imagery remained inconclusive . 
Nonetheless , the JMA classified the pressure area as a tropical depression west of the Marshall Islands at 0600 UTC on January 15 . The JTWC would follow suit by classifying the storm as such at 1800 UTC later that day . At the time , the depression was tracking west @-@ northwest under the influence of a subtropical ridge to the north . Over the next day the system waned in convective activity before resuming its previous track and accelerating . At 0000 UTC on January 17 , the JTWC upgraded the system to tropical storm intensity , though the storm remained nameless as the JMA continued to classify it as a tropical depression . + Throughout January 17 the tropical storm would again oscillate in strength , resulting in a brief downgrade by the JTWC to tropical depression intensity . However , an increase in deep convection resulted in its reclassification as a tropical storm at 1800 UTC that day , followed by the JMA upgrading the system to tropical storm intensity at 1200 UTC on January 18 . As such , the storm received the name Yanyan . At roughly the same time , the tropical cyclone began to stall east of the Mariana Islands and curve sharply northeastward . Whilst the JTWC indicated that Yanyan peaked in strength late on January 18 with 1 @-@ minute sustained winds of 60 km / h ( 37 mph ) , the JMA considered the system to have maintained the same intensity throughout its stint as a tropical storm . Steered by the same nearby subtropical ridge , Yanyan would continue to track towards the northeast into a less favorable tropical cyclone environment . The JMA downgraded Yanyan to tropical depression at 1200 UTC on January 20 before the storm transitioned into an extratropical cyclone as its low @-@ level circulation center decoupled from the primary mass of convection due to strong wind shear . At 0000 UTC , both the JTWC and JMA discontinued the monitoring of Yanyan . 
+ + = = = Typhoon Kujira ( Amang ) = = = + + Kujira developed from a broad area of disturbed weather as a tropical depression on April 9 well removed from any landmasses . Shortly after development , Kujira quickly intensified in its early stages , and was upgraded to a tropical storm just two days after cyclogenesis . Strengthening slowed afterwards , though the storm attained typhoon intensity on April 14 . Intensification continued and late on April 15 , Kujira reached its peak intensity with winds of 165 km / h ( 105 mph ) and a minimum barometric pressure of 930 mbar ( hPa ; 27 @.@ 46 inHg ) . Following peak intensity , Kujira would begin to track northwest and oscillate in strength , cresting an additional two times in intensity . On April 21 , the typhoon was downgraded to tropical storm intensity and began to track erratically for several days east of Taiwan . However , on April 24 , Kujira would resume a northward track and begin to weaken , and on April 24 was downgraded to tropical depression strength as it made landfall on Kyushu . Following landfall , Kujira transitioned into an extratropical cyclone on April 25 , which persisted until crossing the International Dateline towards the end of April 2003 . + Shortly after developing , Kujira caused two fatalities in Pohnpei in addition to minor agricultural and infrastructural damage ; similar effects were felt in Guam . Several days later , the typhoon prompted cyclone warnings and other precautionary measures in the Philippines after forecasts indicated the potential for strong winds and rain . However , ultimately any effects in the archipelago associated with Kujira remained minimal . The typhoon also prompted warning products in Taiwan , making it the first April typhoon since 1978 to cause such a feat . Unlike in the Philippines , however , Kujira would bring significant rainfall to Taiwan . Effects from the typhoon were most significant in Japan , particularly in the Ryukyu Islands . 
Strong winds , rain , and waves caused US $ 230 @,@ 000 ( ¥ 27 @.@ 8 million ) in agricultural damage on Ishigaki Island . One person was killed due to injuries resulting from the waves . In Kyushu , heavy rainfall , peaking at 196 mm ( 7 @.@ 7 in ) in Ōita Prefecture , was reported . Overall , despite its distance away from land and weak intensity at the time of its sole landfall , Kujira resulted in three fatalities . + + = = = Typhoon Chan @-@ hom = = = + + Midday on May 18 , the JTWC began to monitor an area of persistent disturbed weather associated with a broad low @-@ pressure area southwest of Chuuk . Within highly conducive conditions , the disturbance quickly organized and became classified as a tropical depression at 0000 UTC the following day . In its initial stages , the depression tracked slowly northeastwards . However , a shortwave trough forced a weakness in a nearby ridge , allowing for the storm to take a more streamlined , northward path . At 1200 UTC on May 20 , the JMA upgraded the depression to Tropical Storm Chan @-@ hom . Following the system 's naming , Chan @-@ hom temporarily meandered towards the northwest before resuming its northeasterly track . The next day , the storm began to develop an eye ; this was reflected with an upgrade by the JMA to typhoon status at 0600 UTC on May 23 . Gradual intensification followed , and at 1800 UTC that day Chan @-@ hom reached its peak intensity with maximum sustained winds of 155 km / h ( 100 mph ) and a minimum pressure of 940 mbar ( hPa ; 27 @.@ 76 inHg ) . + Following peak intensity , Chan @-@ hom began to intake dry air beginning on May 25 . At roughly the same time , the typhoon began to weaken and accelerate towards the northeast . Conditions continued to worsen as the storm moved further north , and as the cyclone passed east of Minamitorishima , it was downgraded to tropical storm classification . By this time , Chan @-@ hom had lost much of its convection due to wind shear . 
Early on May 27 , Chan @-@ hom had fully transitioned into an extratropical cyclone , and these remnants continued to track towards the northeast . These extratropical remnants dissipated south of the Aleutian Islands the following day . Early in the typhoon 's existence , Chan @-@ hom posed a potential threat to Guam , but remained well east of the island . However , after passing to the northeast , winds from the typhoon fanned volcanic ash from the recently erupting Anatahan volcano towards the island , prompting precautionary measures in Guam . Ashfalls were reported on the island , forcing the cancellation of several flights . As a tropical storm , Chan @-@ hom caused some damage to homes and crops on Chuuk , mostly due to heavy rains brought forth by the storm . Offshore , a 1 @,@ 040 ton fishing vessel , the Nien Feioch , sank during the storm . The ship was valued at $ 16 million . + + = = = Severe Tropical Storm Linfa ( Chedeng ) = = = + + Tropical Storm Linfa developed as a tropical depression just off the western coast of Luzon on May 25 . The disturbance quickly intensified to reach tropical storm intensity a few hours after cyclogenesis . However , intensification leveled off as Linfa executed a small clockwise loop before a subsequent landfall on Luzon on May 27 . Due to land interaction the storm temporarily weakened and decoupled before reforming in the Philippine Sea . Afterwards Linfa began reintensifying and reached its peak intensity on May 29 with maximum sustained winds of 100 km / h ( 65 mph ) and a barometric pressure of 980 mbar ( hPa ; 28 @.@ 94 inHg ) . Following its peak the tropical storm began to deteriorate and transitioned into an extratropical cyclone on May 30 ; these extratropical remnants continued to track northward through Japan before dissipating in the Sea of Okhotsk on June 4 . 
+ The erratic and slow movement of Linfa off the western Philippines was the catalyst for extreme rainfall and flooding , killing 41 persons in the archipelago . Precipitation peaked at 723 mm ( 28 @.@ 5 in ) near Dagupan . Rising floodwaters resulted in the temporary shutdown of government offices and numerous mudslides . In addition , strong winds caused widespread power outages . Overall damage from Linfa in the Philippines amounted to ₱ 192 @.@ 3 million ( US $ 3 @.@ 65 million ) . The floods also displaced 8 @,@ 367 people in 1 @,@ 686 families and destroyed 178 homes . Linfa and its extratropical remnants later brought torrential rainfall and widespread flooding to Japan , particularly in southwestern regions . Rainfall there peaked at 727 mm ( 28 @.@ 62 in ) . Flood damage was worst in Kōchi and Tokushima Prefectures , where several buildings were destroyed by floodwater . Other locations in Japan experienced considerable agricultural damage as well as numerous landslides . Overall , Linfa caused roughly $ 28 @.@ 2 million in damage , much of which occurred in Japan , though the entirety of deaths associated with the cyclone took place in the Philippines . + + = = = Severe Tropical Storm Nangka ( Dodong ) = = = + + In late May , an area of disturbed weather began to persist in the South China Sea . The JTWC began to monitor the storm cluster on May 29 . The following day , the JMA reclassified the system as a tropical depression ; initially the system remained highly disorganized due to the lack of deep convection . Persistent moderate wind shear and dry air prevented the cyclone from strengthening significantly in the storm 's early stages . These conditions abated as the depression tracked northeast , and at 0000 UTC on June 1 , the JMA upgraded the system to Tropical Storm Nangka . 
Throughout the course of the day , Nangka continued to strengthen as it accelerated northeast , and peaked in strength with a barometric pressure of 985 mbar ( hPa ; 29 @.@ 09 inHg ) and maximum sustained winds of 95 km / h ( 60 mph ) , making it a severe tropical storm . + However , upon moving through the Bashi Channel , conditions began to deteriorate due to increased wind shear , weakening the system and resulting in its downgrade to tropical depression status by the JMA at 1200 UTC on June 3 . Nangka continued to become increasingly disorganized as it moved further north , and late that day , the depression transitioned to an extratropical cyclone . The resulting remnants continued to track well east of Japan before dissipating on June 7 . Due to its track away from landmasses , damage remained minimal ; however , as Nangka passed to the south and east of Japan , the storm brought light rainfall to the country , peaking at 81 mm ( 3 @.@ 2 in ) in Minamidaitō , Okinawa . + + = = = Typhoon Soudelor ( Egay ) = = = + + A tropical disturbance persisted in the monsoon trough northwest of Pohnpei on June 7 , and moved westward without development due to wind shear . On June 11 , the shear decreased enough to allow the convection to organize , and the next day the JMA classified it as a tropical depression northeast of Palau . On June 13 , the JMA upgraded it to Tropical Storm Soudelor to the east of the Philippines , and PAGASA gave it the local name " Egay " . Soudelor moved to the northwest and later to the north , parallel to the eastern Philippines , and on June 17 , the JMA upgraded it to typhoon status . The storm rapidly intensified to the east of Taiwan as it developed a well @-@ defined eye , and while doing so passed over the Japanese island of Iriomote @-@ jima at around 2030 UTC on June 17 . At 0600 UTC on June 18 , the JTWC estimated peak 1 minute winds of 215 km / h ( 135 mph ) , while the JMA estimated peak 10 minute winds of 150 km / h ( 90 mph ) . 
Increased shear weakened the typhoon to tropical storm strength on June 19 , and later that day the JMA declared the storm as extratropical near the Oki Islands . The extratropical remnants of Soudelor continued to the northeast , crossing northern Japan on June 20 and dissipating on June 24 . + While offshore the Philippines , Soudelor dropped heavy rainfall that caused flooding and left thousands homeless . The storm caused ₱ 131 million ( PHP , $ 2 @.@ 46 million USD ) in damage and 12 deaths . On the Japanese island of Iriomote @-@ jima , wind gusts reached 204 km / h ( 127 mph ) . It also affected Taiwan , where floods covered highways and caused mudslides . In Japan , the storm caused widespread power outages , although damage was minimal , and there were 21 injuries . In South Korea , there was $ 12 @.@ 1 million in damage and two deaths . + + = = = Typhoon Imbudo ( Harurot ) = = = + + On July 15 , the JMA estimated that a tropical depression formed , and the next day the JTWC initiated advisories on Tropical Depression 09W about 665 km ( 415 mi ) east of Yap . A subtropical ridge near Okinawa steered the nascent depression to the west @-@ northwest for much of its duration . With warm waters and favorable upper @-@ level conditions , the depression quickly organized , first to Tropical Storm Imbudo on July 17 , and to typhoon status two days later , when PAGASA began issuing advisories on Typhoon Harurot . Around that time , Imbudo was rapidly intensifying , developing a well @-@ defined eye . At 1200 UTC on July 20 , the JMA estimated peak 10 minute sustained winds of 165 km / h ( 105 mph ) , and at the same time , the JTWC estimated 1 minute sustained winds of 240 km / h ( 150 mph ) , making it a super typhoon . Imbudo maintained peak winds for about 12 hours , before undergoing an eyewall replacement cycle . At 0300 UTC on July 22 , Imbudo struck northern Luzon , with 1 minute winds estimated at 205 km / h ( 125 mph ) by the JTWC . 
It weakened over land , but re @-@ intensified in the South China Sea , striking southern China near Yangjiang , Guangdong on July 24 . Imbudo rapidly weakened , dissipating on July 25 . + In the Philippines , officials evacuated over 14 @,@ 000 people . Imbudo was the strongest typhoon to strike since Typhoon Zeb five years prior . The typhoon left widespread areas flooded for several days . Damage was heaviest in the Cagayan Valley , where over 80 @,@ 000 people were displaced by the storm . In Isabela , high winds wrecked most of the banana crop and severely damaged other crops . Throughout the Philippines , Imbudo damaged or destroyed 62 @,@ 314 houses , causing P4.7 billion ( 2003 PHP , $ 86 million 2003 USD ) in damage . There were 64 deaths in the country . In southern China in Yangjiang , more than 30 @,@ 000 people evacuated ahead of the storm , and more than half of the trees in the city fell due to strong winds . High winds killed a man in Hong Kong after knocking him off a platform . Throughout Guangdong , Imbudo destroyed 595 @,@ 000 houses and caused eight deaths . Heavy rains spread across southern China , peaking at 343 mm ( 13 @.@ 5 in ) at Hepu County in Guangxi province . There , 12 people died from the storm . Overall damage in China was about ¥ 4 @.@ 45 billion ( CNY , $ 297 million USD ) . + + = = = Severe Tropical Storm Koni ( Gilas ) = = = + + Koni originated from a tropical depression situated within the monsoon trough to the east of the Philippines on July 15 . Tracking westward , intensification was slow and the system remained a tropical depression as it moved across the central Philippines on July 17 . Upon moving into the South China Sea , conditions allowed for quicker strengthening , and the cyclone reached tropical storm status on July 18 before reaching its peak intensity with maximum sustained winds of 110 km / h ( 70 mph ) , making it a severe tropical storm . 
However , atmospheric conditions began to deteriorate as Koni made landfall on Hainan on July 21 , weakening the system . The tropical storm continued to weaken as it moved over the Gulf of Tonkin prior to a final landfall near Hanoi , Vietnam the following day . Tracking inland , the combination of land interaction and wind shear caused Koni to dissipate over Laos on July 23 . + Shortly after development , Koni tracked through the Philippines , killing two people . After moving into the South China Sea , turbulence produced by the storm resulted in an aviation incident involving a commercial airliner off the western Philippines . Three of the plane 's occupants received minor injuries . In Hainan , Koni caused heavy rainfall , peaking at 189 mm ( 7 @.@ 44 in ) at a station on Wuzhi Mountain . The rains resulted in the collapse of 1 @,@ 400 homes and an estimated CN ¥ 140 @.@ 27 million ( US $ 16 @.@ 9 million ) in direct economic losses . Effects were worst in Vietnam , where three people were killed . Widespread power outages occurred , and strong winds resulted in agricultural and infrastructural damage , particularly in Vietnam 's northern provinces . + + = = = Tropical Storm Morakot ( Juaning ) = = = + + Morakot spawned from an area of disturbed weather in the Philippine Sea on July 31 . Tracking northwest , favorable conditions allowed for the intensification of the system to tropical storm strength on August 2 . Morakot reached peak intensity later that day with winds of 85 km / h ( 50 mph ) and a minimum barometric pressure of 992 mbar ( hPa ; 28 @.@ 29 inHg ) . This intensity was held for several hours until less conducive atmospheric conditions slightly weakened the system ; this was followed by Morakot making landfall on southern Taiwan on August 3 . Subsequently , the storm weakened and moved into the Taiwan Strait before making its final landfall near Quanzhou , China the next day . 
The storm quickly weakened over the Chinese mainland , and dissipated entirely several hours after landfall . + In Taiwan , where Morakot first made landfall , heavy rainfall resulted in flooding . Commercial flights , schools , and rail service in some areas were cancelled in advance of the storm . Precipitation there peaked at 653 mm ( 25 @.@ 71 in ) over a period of nearly two days in Taitung County . Crop damage also resulted from the rainfall , and was estimated at over NT $ 70 million ( US $ 2 million ) . In China , record rainfall was reported . The worst impacted city was Quanzhou , where losses due to Morakot reached CN ¥ 240 million ( US $ 29 million ) and one death was reported . Power outages were also widespread across southeastern China . Due to preexisting drought conditions , 703 cloud seeding operations took place in order to artificially generate added rainfall ; such operations resulted in moderate precipitation over the targeted area . Overall , Morakot caused roughly $ 31 million in damage and three deaths . + + = = = Typhoon Etau ( Kabayan ) = = = + + A tropical depression developed on August 2 southeast of Guam , and gradually intensified while moving to the northwest , becoming a tropical storm on August 3 and a typhoon a day later . Etau formed an eye and became a large storm by the time it approached Okinawa on August 7 . The typhoon attained peak winds of 155 km / h ( 100 mph ) before weakening slightly while turning to the northeast . Etau made landfall on the Japanese island of Shikoku on August 8 , and later moved across portions of Honshu and Hokkaido . After weakening to tropical storm status , the cyclone became extratropical on August 9 and dissipated three days later . + While passing northeast of the Philippines , the typhoon caused light damage in the archipelago . The eye crossed over Okinawa , where Etau left 166 @,@ 800 people without power and caused 10 injuries . 
Near where Etau first struck Japan , Muroto reported a peak wind gust of 166 km / h ( 103 mph ) , at the time the third strongest on record there . The typhoon also dropped torrential rainfall peaking at 683 mm ( 26 @.@ 9 in ) . The combination of winds and rainfall caused landslides , particularly on Hokkaido . Nationwide , Etau killed 20 people , destroyed 708 houses , and caused ¥ 35 @.@ 1 billion ( JPY , $ 294 @.@ 8 million USD ) in damage . + + = = = Typhoon Krovanh ( Niña ) = = = + + Krovanh originated from a tropical disturbance within the monsoon trough east of Chuuk State on August 13 . Despite rather favorable conditions , the initial tropical depression did not intensify significantly and degenerated into a remnant low on August 18 . However , these remnants were able to reorganize and the system was reclassified as a tropical cyclone a day later . Intensification was rather rapid upon the storm 's reformation – the depression reached tropical storm status on August 20 and then typhoon intensity two days later . Shortly after , Krovanh made landfall on Luzon at peak intensity with winds of 120 km / h ( 75 mph ) . The typhoon emerged into the South China Sea as a much weaker tropical storm , though it was able to restrengthen over warm waters . Once again at typhoon intensity , Krovanh clipped Hainan before moving over the Leizhou Peninsula on its way to a final landfall near Cẩm Phả , Vietnam on August 25 . Quick weakening due to land interaction occurred as Krovanh moved across northern Vietnam , where the storm met its demise the following day . + Krovanh first struck the Philippines , resulting in heavy rainfall and displacing approximately 1 @,@ 000 families . The flooding caused severe damage and killed one person . Krovanh 's effects were much more severe in China . In Hong Kong , eleven people were injured and isolated flooding occurred as a result of the typhoon 's outer rainbands . 
However , Guangdong Province , Hainan Province , and Guangxi were the Chinese regions most extensively impacted . The typhoon brought record wind gusts into Guangxi . In those three regions combined , 13 @,@ 000 homes were estimated to have collapsed and a large swath of farmland was damaged . Two people were killed in China and economic losses approximated to ¥ 2 @.@ 1 billion ( US $ 253 million ) . Due to its positioning and track , of all areas in Vietnam only the country 's more northern regions were impacted by Krovanh . Flash flooding occurred in earnest in those regions , and 1 @,@ 000 homes were flattened . One person was killed and five others were injured in Vietnam . Overall , the typhoon was responsible for the deaths of four persons . + + = = = Tropical Storm Vamco ( Manang ) = = = + + The monsoon trough spawned several tropical disturbances in the middle of August , one of which became Tropical Depression Lakay near the Philippines . On August 18 , an area of convection persisted on the southern side of a circulation , developing into a tropical depression east of Luzon . It moved quickly northward and later to the northwest in an area generally unfavorable for strengthening , such as the presence of wind shear and land interaction . On August 19 , the JMA upgraded the depression to Tropical Storm Vamco to the east of Taiwan . Later that day , the circulation passed just 55 km ( 35 mi ) north of Taipei , although the convection was exposed that time . On August 20 , the JMA assessed Vamco as dissipating in the Taiwan Strait , although the JTWC continued advisories until the storm moved ashore in southeastern China . + Rainfall in Taiwan reached 69 mm ( 2 @.@ 7 in ) in Ilan County . On the island , the storm left several hundred houses without power due to a lightning strike . 
On mainland China , rainfall peaked at 101 mm ( 4 @.@ 0 in ) in Wenzhou , Zhejiang , which was largely beneficial in easing drought conditions , while winds gusted to 100 km / h ( 62 mph ) . The storm damaged or destroyed 5 @,@ 880 houses and flooded 1 @,@ 287 ha ( 3 @,@ 180 acres ) of paddy fields , causing ¥ 38 @.@ 6 million ( CNY , $ 4 @.@ 7 million USD ) in damage . + + = = = Typhoon Dujuan ( Onyok ) = = = + + On August 27 , a tropical depression developed about 520 km ( 325 mi ) northwest of Guam , which initially drifted to the southwest before turning to the northwest . On August 29 , the JMA upgraded it to Tropical Storm Dujuan , and that day PAGASA began issuing advisories on Tropical Storm Onyok . The storm quickly intensified into a typhoon , after developing an eye in the center . On September 1 , the JMA estimated Dujuan attained peak 10 minute winds of 150 km / h ( 90 mph ) , and the JTWC assessed peak 1 – minute winds of 230 km / h ( 145 mph ) . While near peak intensity , the center of Dujuan passed about 45 km ( 30 mi ) south of the southern tip of Taiwan . The typhoon weakened steadily and was a severe tropical storm by the time it made landfall on September 2 just east of Hong Kong . The JTWC estimated landfall winds of 185 km / h ( 115 mph ) , making it the strongest typhoon to strike the Pearl River Delta since Typhoon Hope in 1979 . Dujuan rapidly weakened while continuing westward through China , dissipating on September 3 over Guangxi . + In the Philippines , Dujuan interacted with the monsoon to produce heavy rainfall , killing one person . While in the vicinity , Dujuan produced gusts of 100 km / h ( 62 mph ) on Yonaguni , a Japanese subdivision of Okinawa . Heavy rainfall in Taiwan reached 628 mm ( 24 @.@ 7 in ) in Pingtung County , and winds peaked at 176 km / h ( 109 mph ) on Orchid Island before the anemometer was destroyed . The typhoon caused about NT $ 200 ( NWD , $ 115 million USD ) in crop damage , and killed three people . 
Damage was minor in Hong Kong , and four fishermen were missing and presumed drowned after their boat sank . On the Chinese mainland , strong winds left 90 % of the city of Shenzhen without power , and killed 16 construction workers due to a half @-@ finished building collapsing . Across Guangdong , the typhoon damaged crops and destroyed 54 @,@ 000 homes . Overall damage in China was estimated at ¥ 2 @.@ 3 billion ( CNY , $ 277 million USD ) , and across Guangdong , the typhoon killed 40 people . + + = = = Typhoon Maemi ( Pogi ) = = = + + Typhoon Maemi formed on September 4 from the monsoon trough in the western Pacific Ocean . It slowly intensified into a tropical storm while moving northwestward , and Maemi became a typhoon on September 8 . That day , it quickly intensified due to favorable conditions , developing a well @-@ defined eye and reaching peak maximum sustained winds of 195 km / h ( 120 mph ) . While near peak intensity , Maemi was decelerating and began turning to the north @-@ northeast . The eyewall soon after passed over the Japanese island of Miyako @-@ jima on September 10 , producing the fourth lowest pressure on record in Japan after a pressure of 912 mbar ( 26 @.@ 9 inHg ) was recorded . With warm waters , Maemi was able to maintain much of its intensity before it made landfall just west of Busan , South Korea on September 12 . On Jeju Island , Maemi produced a peak wind gust of 216 km / h ( 134 mph ) and a minimum pressure of 950 mbar ( 28 inHg ) , both setting records for the country , and making it the most powerful typhoon to strike South Korea since record @-@ keeping began in the country in 1904 . The typhoon became extratropical in the Sea of Japan the next day , although the remnants persisted for several more days , bringing strong winds to northern Japan . + The typhoon first affected the Ryukyu Islands of Japan . On Miyako @-@ jima , strong winds damaged 104 buildings , and 95 % of residents lost power . 
Maemi dropped heavy rainfall there , including rates of 58 @.@ 5 mm ( 2 @.@ 30 in ) in an hour , and 402 @.@ 5 mm ( 15 @.@ 85 in ) in 24 hours , the latter setting a record . One person died on Miyako @-@ jima after being struck by flying glass . Elsewhere in Japan , the storm caused flights to be canceled , while rainfall @-@ induced landslides blocked roads . There were two other deaths in Japan , and damage totaled ¥ 11 @.@ 3 billion ( JPY , $ 96 million USD ) . Damage was heaviest in South Korea , notably where it moved ashore . Winds in Busan near the landfall location reached 154 km / h ( 96 mph ) , the second @-@ highest on record . There , the port sustained heavy damage , causing disruptions to exports in the months following the storm . Nationwide , the high winds destroyed about 5 @,@ 000 houses and damaged 13 @,@ 000 homes and businesses , leaving 25 @,@ 000 people homeless . About 1 @.@ 47 million lost power , and widespread crop damage occurred , resulting in the worst rice crop in 23 years . Across South Korea , Maemi killed 117 people , and overall damage totaled ₩ 5 @.@ 52 trillion won ( $ 4 @.@ 8 billion USD ) . + + = = = Typhoon Choi @-@ wan ( Roskas ) = = = + + In the middle of September , the monsoon trough spawned a rapidly organizing disturbance east @-@ northeast of Luzon , with weak wind shear and favorable conditions . On September 16 , the JMA classified it as a tropical depression , and the JTWC initiated advisories the next day . The system moved to the northwest due to the subtropical ridge to the northeast and later to the north . On September 18 , the JMA upgraded the depression to Tropical Storm Choi @-@ wan , the same day that PAGASA classified it as Tropical Storm Roskas . An eastward @-@ moving trough turned the storm to the northeast , bringing the track over Okinawa and Amami Ōshima on September 19 . Choi @-@ wan continued gradually intensifying , becoming a typhoon on September 20 to the southeast of Japan . 
That day , the JMA estimated peak winds of 130 km / h ( 80 mph ) , and the JTWC estimated peak 1 minute winds of 185 km / h ( 115 mph ) on September 21 , after Choi @-@ wan developed a well @-@ defined eye . The typhoon weakened due to increasing wind shear , deteriorating to severe tropical storm status on September 22 before JMA declared it extratropical on September 23 . The remnants of Choi @-@ wan continued to the northeast , exited the basin on September 24 , and eventually struck southern Alaska on September 25 . + Wind gusts in Okinawa reached 115 km / h ( 72 mph ) , while on the volcanic island of Hachijō @-@ jima , gusts reached 214 km / h ( 133 mph ) . On the Japanese mainland , winds gusted to 126 km / h ( 78 mph ) at Chōshi , Chiba . Choi @-@ wan dropped heavy rainfall while near Japan , peaking at 316 mm ( 12 @.@ 4 in ) on Miyake @-@ jima . In Okinawa , Choi @-@ wan flooded a boat , forcing its occupants to be rescued by the Coast Guard . Also on the island , heavy rainfall caused landslides and flooded houses . In Amami Ōshima , the storm left 10 @,@ 810 people without power . On Hachijō , wind gusts of 214 km / h ( 133 mph ) damaged about 200 houses . Nationwide , Choi @-@ wan destroyed 191 homes , injured 9 people , and left about ¥ 300 million ( JPY , $ 2 @.@ 5 million USD ) . + + = = = Typhoon Koppu ( Sikat ) = = = + + Towards the end of September , the monsoon trough spawned a tropical disturbance east @-@ northeast of Yap , which became a tropical depression on September 24 . There were initially several circulations , with a tropical upper tropospheric trough to the northeast increasing outflow . After slowing and turning to the northeast , the depression intensified into Tropical Storm Koppu on September 26 . After the storm developed a large eye feature , the JTWC upgraded it to typhoon status on September 27 , although the JMA did not follow suit until the following day while near Chichi @-@ jima . 
Also that day , Koppu passed 95 km ( 60 mi ) west of Iwo Jima , and the JMA estimated peak 10 minute winds of 130 km / h ( 80 mph ) . The JTWC estimated peak 1 minute winds of 165 km / h ( 105 mph ) , before an approaching trough caused the typhoon to accelerate northeastward . The convection diminished near the center , causing Koppu to become extratropical on September 30 . The remnants continued generally northeastward through the Aleutian Islands , eventually passing south of mainland Alaska on October 7 . + On Chichi @-@ jima , Typhoon Koppu produced sustained winds of 102 km / h ( 63 mph ) , with gusts to 200 km / h ( 124 mph ) , which was the third strongest on record for the station . Rainfall there reached 183 mm ( 7 @.@ 2 in ) . Wind gusts on Iwo Jima peaked at 109 km / h ( 68 mph ) . + + = = = Typhoon Ketsana ( Tisoy ) = = = + + In the middle of October , an area of convection persisted along the monsoon trough between Luzon and Guam , developing into a tropical depression on October 17 . The same monsoon trough later spawned Typhoon Parma to the east . For several days , the system remained disorganized while drifting to the west @-@ northwest due to weak steering currents south of the subtropical ridge . On October 19 , the JMA upgraded the depression to Tropical Storm Ketsana , and by that time the storm had begun drifting to the northeast . With favorable outflow , Ketsana quickly intensified into a typhoon on October 20 after developing an eye , and two days later , the JMA estimated peak winds of 165 km / h ( 105 mph ) . The JTWC estimated peak 1 minute winds of 230 km / h ( 145 mph ) around the time the well @-@ defined eye had expanded to 37 km ( 23 mi ) . Subsequently , the typhoon accelerated northeastward into the westerlies and began weakening due to increasing wind shear and dry air . On October 26 , Ketsana became extratropical to the east of Japan and dissipated the next day . 
The passage of the typhoon caused surface chlorophyll a concentration in the ocean to increase 30 @-@ fold . + + = = = Typhoon Parma = = = + + The same monsoon trough that spawned Typhoon Koppu also produced an area of convection to the north @-@ northeast of Guam , becoming a tropical depression on October 19 . The system moved northwestward and later turned to the northeast around the subtropical ridge . With low wind shear and favorable outflow , the convection became better organized , and the JMA upgraded it to Tropical Storm Parma on October 21 . After an eye began to form , Parma was upgraded to typhoon status the next day . An approaching trough caused Parma to accelerate northeastward while also increasing outflow . On October 24 , the JMA estimated peak winds of 175 km / h ( 110 mph ) while the JTWC estimated winds of 240 km / h ( 150 mph ) , an unusually high intensity for 30 ° N. Subsequently , Parma rounded the subtropical ridge and began moving to the east @-@ southeast , beginning a nearly week @-@ long loop . The cold front had passed to the north and failed to bring the typhoon northeastward . + Increasing wind shear weakened the convection , and Parma deteriorated into a severe tropical storm on October 26 . The next day , it began moving westward while passing about 345 km ( 215 mi ) north of Wake Island . A large eye of 110 km ( 70 mi ) in diameter developed , and on October 28 , the JMA re @-@ upgraded Parma to typhoon status . The next day , the typhoon turned to the northeast due to another approaching trough . With decreasing wind shear and warmer waters , Parma re @-@ intensified significantly on October 29 , reaching a secondary peak of 165 km / h ( 105 mph ) according to JMA , and 215 km / h ( 135 mph ) according to JTWC . The storm moved very closely along the track it took several days prior . Increasing wind shear on October 30 caused rapid weakening , resulting in the eye dissipating . 
By the next day , the center was exposed , and Parma became extratropical , later exiting the basin on November 1 . The remnants weakened , later turning to the southeast and dissipating on November 11 southwest of California . + + = = = Severe Tropical Storm Melor ( Viring ) = = = + + Late in October , an area of convection persisted northwest of Palau and quickly organized into a tropical depression on October 29 . Moving west @-@ northwestward toward the Philippines due to a ridge to the east , the depression intensified into Tropical Storm Melor on October 30 . With minimal wind shear , the storm strengthened further and developed a large eye . The JTWC upgraded Melor to typhoon status on October 31 , estimating peak 1 minute winds of 140 km / h ( 85 mph ) , although the JMA estimated the storm only attained peak 10 minute winds of 95 km / h ( 60 mph ) . Early on November 1 , Melor made landfall on northeastern Luzon in the Philippines , south of Palanan . The storm weakened over land and emerged into the South China Sea . By that time , it was moving northward along the periphery of the ridge to the east . On November 2 , Melor turned to the northeast , passing just east of Taiwan . The next day , it weakened to tropical depression status , and after meandering offshore eastern Taiwan , Melor continued northeastward . It became extratropical on November 5 and dissipated the next day near southern Japan . + In the Philippines , Melor dropped about 150 mm ( 6 in ) of rainfall , which flooded the Cagayan River and killed four people . In Taiwan , rainfall reached 554 mm ( 21 @.@ 8 in ) in Pingtung County . On the Japanese island of Hateruma , rainfall totaled 197 mm ( 7 @.@ 8 in ) , which broke the hourly and daily record for the station in November . + + = = = Typhoon Nepartak ( Weng ) = = = + + A tropical depression developed near Yap on November 11 . The system intensified gradually as it tracked quickly westward toward the Philippines . 
An anticyclone aloft allowed for strengthening , and the JMA upgraded the depression to Tropical Storm Nepartak on November 12 . Simultaneously , the cyclone entered the area of responsibility of the Philippine Atmospheric , Geophysical and Astronomical Services Administration , which named it Tropical Storm Weng . At around 1600 UTC on November 13 , Nepartak made landfall on northern Samar Island in the Philippines before traversing the remainder of the archipelago from east to west . The cyclone emerged into the South China Sea weakened but quickly re @-@ intensified while continuing to the west @-@ northwest . The JTWC estimated peak winds of 140 km / h ( 85 mph ) on November 16 , and later that day , the JMA upgraded the storm to typhoon status , estimating peak 10 @-@ minute winds of 120 km / h ( 75 mph ) . On November 18 , Nepartak passed near southwestern Hainan and weakened , with the convection diminishing from the circulation . By the next day , the system weakened to tropical depression status , and dissipated shortly after moving ashore over Beihai , China . + In the Philippines , Nepartak produced strong winds , heavy rainfall , and rough seas . The storm caused widespread power outages and ferry disruptions . According to the PAGASA in its post @-@ storm report , a total of 13 people lost their lives during the storm . On Hainan , the storm helped end one of the worst summer droughts in almost 65 years , although it also left heavy crop damage , wrecking 64 @,@ 000 ha ( 160 @,@ 000 acres ) of fields and killing 400 head of livestock . With about 800 homes destroyed , damage on Hainan amounted to $ 197 million ( 2003 USD ) . Effects were minor in mainland China . + + = = = Typhoon Lupit ( Yoyoy ) = = = + + Typhoon Lupit formed on November 18 from the monsoon trough to the west of the Marshall Islands . Early in its duration , it moved generally to the west or west @-@ southwest . 
On November 21 , the depression intensified into Tropical Storm Lupit , and two days later , it strengthened into a typhoon , developing an eye . Lupit later began a prolonged movement to the northwest , during which it passed near several islands in Yap State . The typhoon reached peak intensity on November 26 , with peak 10 – minute sustained winds of 185 km / h ( 115 mph ) . It later weakened due to a variety of unfavorable conditions , and after recurving to the northeast , Lupit became extratropical south of Japan on December 2 . + Typhoon Lupit first affected Pohnpei with gusty winds , and later it damaged or destroyed about 200 homes in Chuuk State . There , high waves flooded roads and homes , while high winds damaged crops . Damage was heaviest in Yap State , mostly in the small Ulithi atoll and Fais Island . On both islands , the typhoon contaminated the water supply and wrecked the crops . Rainfall reached 263 mm ( 10 @.@ 35 in ) on Ulithi , and gusts reached 158 km / h ( 98 mph ) . Throughout the FSM , damage totaled about $ 1 @.@ 7 million , although there were no deaths . The damage prompted the FSM government to declare two states as disaster areas , as well as a disaster declaration from the United States federal government . While Lupit was becoming extratropical , it became the first typhoon in December to threaten Japan in 13 years . The storm dropped rainfall that resulted in mudslides and flight cancellations . + + = = = Other storms = = = + + An area of convection formed on May 16 to the southwest of Palau , located within an area of weak wind shear . The next day , the JMA and the JTWC both classified the system as a tropical depression . The convection was disorganized in association with multiple circulation centers , although it gradually organized . Moving westward initially , the depression turned more to the north into an area of increasing wind shear . 
On May 19 , the JTWC upgraded the system to Tropical Storm 03W , and on the same day , PAGASA classified it as Tropical Depression Batibot . Soon after , the convection decreased from the center , and by May 20 , all warning agencies had discontinued advisories . + On July 9 , PAGASA classified a system as Tropical Depression Falcon , off the west coast of the Philippines . The JMA also briefly initiated advisories before dropping them later on July 9 . Later in the month , PAGASA briefly issued advisories on Tropical Depression Ineng on July 30 off the east coast of Mindanao . The depression dissipated the next day , causing about P8 million ( PHP , $ 145 @,@ 000 USD ) in damage . The monsoon trough spawned a tropical depression north of Luzon on August 18 , with PAGASA naming it Lakay . There were several circulations in the region , with Tropical Storm Vamco to the northeast near Taiwan , and the overall system moved generally westward . PAGASA briefly classified Lakay as a tropical storm on August 19 before ending advisories the next day . The system spread rainfall across China , reaching 82 @.@ 4 mm ( 3 @.@ 24 in ) in Xiamen , Fujian . + On September 5 , former Hurricane Jimena crossed the International Date Line into the basin . By that time , the circulation was largely exposed from the convection , and the center quickly dissipated . Later in the month , the monsoon trough spawned a disturbance east of the Philippines that PAGASA classified as Tropical Depression Quiel on September 15 . The system moved westward but never intensified , dissipating west of Luzon on September 19 . The broad system also spawned Typhoon Choi @-@ wan . + In October , the JTWC classified Tropical Depression 18W early in the month off the west coast of Luzon . With weak steering currents , the system moved slowly southwestward before looping to the northwest . On October 10 , the depression dissipated just off the coast of southern China . 
On October 5 , the JMA monitored a tropical depression southeast of Taiwan that later passed near the island , producing heavy rainfall that peaked at 153 mm ( 6 @.@ 0 in ) in Ilan County . A few days later , the JTWC monitored Tropical Depression 19W , which developed on October 12 after an extratropical storm produced an area of convection . Described as a subtropical low , the depression moved generally northeastward toward Japan due to an approaching cold front . The depression moved through Kyushu and Honshu before dissipating on October 13 . The depression dropped 285 mm ( 11 @.@ 2 in ) of rainfall in Kōchi , while strong winds associated reached 217 km / h ( 135 mph ) through a storm @-@ produced downburst . The winds knocked over two cranes , killing two people , and left about 9 @,@ 000 homes without power . The depression also killed two people due to drownings . On October 16 , the JMA briefly classified a tropical depression to the east of the Marianas Islands . On October 22 , a tropical depression developed in the South China Sea , classified by PAGASA as Ursula . The system moved eastward and crossed Palawan before dissipating on October 24 . In the Philippines , the depression killed one person and caused minor damage . Also in October , the monsoon trough spawned a tropical depression in the Gulf of Thailand , which moved northwestward and crossed into the Indian Ocean , dropping heavy rainfall in Thailand . + In mid @-@ November , the JMA briefly tracked a weak tropical depression near Wake Island . The agency also briefly tracked a tropical depression off the coast of Vietnam on December 16 . It finally dissipated on December 17 , with the pressure and winds unknown . The final system of the year was a tropical depression that originated out of the monsoon trough on December 24 east of the Philippines . After initially moving to the west , it turned to the south , and the JTWC estimated the depression intensified into a tropical storm . 
During this time , PAGASA classified it as Tropical Storm Zigzag . The system made landfall in northeastern Mindanao and dissipated on December 27 , bringing heavy rainfall . + + = = Storm names = = + + Within the North @-@ western Pacific Ocean , both the Japan Meteorological Agency ( JMA ) and the Philippine Atmospheric , Geophysical and Astronomical Services Administration assign names to tropical cyclones that develop in the Western Pacific , which can result in a tropical cyclone having two names . The Japan Meteorological Agency 's RSMC Tokyo — Typhoon Center assigns international names to tropical cyclones on behalf of the World Meteorological Organization 's Typhoon Committee , should they be judged to have 10 @-@ minute sustained windspeeds of 65 km / h ( 40 mph ) . Meanwhile , the Philippine Atmospheric , Geophysical and Astronomical Services Administration assigns names to tropical cyclones which move into or form as a tropical depression in their area of responsibility located between 135 ° E and 115 ° E and between 5 ° N @-@ 25 ° N even if the cyclone has had an international name assigned to it . The names of significant tropical cyclones are retired , by both PAGASA and the Typhoon Committee . Should the list of names for the Philippine region be exhausted then names will be taken from an auxiliary list of which the first ten are published each season . Unused names are marked in gray . + + = = = International names = = = + + During the season 21 named tropical cyclones developed in the Western Pacific and were named by the Japan Meteorological Agency , when it was determined that they had become tropical storms . These names were contributed to a list of 140 names submitted by the fourteen member nations and territories of the ESCAP / WMO Typhoon Committee . 
+ + = = = Philippines = = = + + The Philippine Atmospheric , Geophysical and Astronomical Services Administration uses its own naming scheme for tropical cyclones in their area of responsibility . PAGASA assigns names to tropical depressions that form within their area of responsibility and any tropical cyclone that might move into their area of responsibility . Should the list of names for a given year prove to be insufficient , names are taken from an auxiliary list , the first 10 of which are published each year before the season starts . The names not retired from this list will be used again in the 2007 season . Names that were not assigned are marked in gray . + + = = = Retirement = = = + + The names Imbudo and Maemi were retired by the ESCAP / WMO Typhoon Committee . The names Molave and Mujigae were chosen to replace Imbudo and Maemi respectively . Hong Kong also requested that Yanyan be removed from the list , and it was replaced by Dolphin . The Philippine Atmospheric , Geophysical and Astronomical Services Administration ( PAGASA ) announced that the name Harurot was retired due to extensive damage . The name Hanna was chosen to replace Harurot . Also , the name " Koni " was replaced by " Goni " , after it was found that Koni was a misspelling . + + = = Storm effects = = + + The following table provides basic meteorological and impact information for each tropical cyclone from the 2003 Pacific typhoon season in tabular format ; unnamed tropical cyclones are not included . PAGASA names for storms are provided in parentheses . Storms entering from the Central Pacific only include their information while in the western Pacific , and are noted with an asterisk * . + + + = Oxaziridine = + + An oxaziridine is an organic molecule that features a three @-@ membered heterocycle containing oxygen , nitrogen , and carbon . In their largest application , oxaziridines are intermediates in the industrial production of hydrazine . 
Oxaziridine derivatives are also used as specialized reagents in organic chemistry for a variety of oxidations , including alpha hydroxylation of enolates , epoxidation and aziridination of olefins , and other heteroatom transfer reactions . Oxaziridines also serve as precursors to amides and participate in [ 3 + 2 ] cycloadditions with various heterocumulenes to form substituted five @-@ membered heterocycles . Chiral oxaziridine derivatives effect asymmetric oxygen transfer to prochiral enolates as well as other substrates . Some oxaziridines also have the property of a high barrier to inversion of the nitrogen , allowing for the possibility of chirality at the nitrogen center . + + = = History = = + + Oxaziridine derivatives were first reported in the mid @-@ 1950s by Emmons and subsequently by Krimm and Horner and Jürgens . Whereas oxygen and nitrogen typically act as nucleophiles due to their high electronegativity , oxaziridines allow for electrophilic transfer of both heteroatoms . This unusual reactivity is due to the presence of the highly strained three @-@ membered ring and the relatively weak N @-@ O bond . Nucleophiles tend to attack at the aziridine nitrogen when the nitrogen substituent is small ( R1 = H ) , and at the oxygen atom when the nitrogen substituent has greater steric bulk . The unusual electronics of the oxaziridine system may be exploited to perform a number of oxygen and nitrogen transfer reactions including , but not limited to : α @-@ hydroxylation of enolates , epoxidation of alkenes , selective oxidation of sulfides and selenides , amination of N @-@ nucleophiles and N @-@ acylamidation . + The Peroxide process for the industrial production of hydrazine through the oxidation of ammonia with hydrogen peroxide in the presence of ketones was developed in the early 1970s . + Chiral camphorsulfonyloxaziridines proved useful in the syntheses of complex natural products , such as taxol , which is marketed as a chemotherapy agent . 
Both the Holton Taxol total synthesis and the Wender Taxol total synthesis feature asymmetric α @-@ hydroxylation with camphorsulfonyloxaziridine . + + = = Synthesis = = + + + = = = N @-@ H , N @-@ Alkyl , N @-@ Aryloxaziridines = = = + + The two main approaches to synthesis of N @-@ H , N @-@ alkyl , and N @-@ aryloxaziridines are oxidation of imines with peracids ( A ) and amination of carbonyls ( B ) . + Additionally , oxidation of chiral imines and oxidation of imines with chiral peracids may yield enantiopure oxaziridines . Some oxaziridines have the unique property of configurationally stable nitrogen atoms at room temperature due to an inversion barrier of 24 to 31 kcal / mol . Enantiopure oxaziridines where stereochemistry is entirely due to configurationally stable nitrogen are reported . + + = = = N @-@ Sulfonyloxaziridines = = = + + In the late 1970s and early 1980s Franklin A. Davis synthesized the first N @-@ sulfonyloxaziridines , which act exclusively as oxygen transfer reagents , and are the most predominantly used class of oxaziridines today . While originally synthesized with mCPBA and the phase transfer catalyst benzyltrimethylammonium chloride , an improved synthesis using oxone as the oxidant is now most prevalent . + Many N @-@ sulfonyloxaziridines are used today , each with slightly different properties and reactivity . These reagents are summarized in the table below . + + = = = Perfluorinated oxaziridines = = = + + With highly electron withdrawing perfluoroalkyl substituents , oxaziridines exhibit reactivity more similar to that of dioxiranes than typical oxaziridines . Notably , perfluoroalkyloxaziridines hydroxylate certain C @-@ H bonds with high selectivity . Perfluorinated oxaziridines may be synthesized by subjecting a perfluorinated imine to perfluoromethyl fluorocarbonyl peroxide and a metal fluoride to act as an HF scavenger . 
+ + = = Reactions of oxaziridines = = + + + = = = Hydrazine production = = = + + Oxaziridines are intermediates in the Peroxide process for the production of hydrazine . Many millions of kilograms of hydrazine are produced annually by this method that involves a step wherein ammonia is oxidized in the presence of methyl ethyl ketone to give the oxaziridine : + Me ( Et ) C = O + NH3 + H2O2 → Me ( Et ) CONH + 2 H2O + In subsequent steps the oxaziridine is converted to the hydrazone , which is the immediate precursor en route to hydrazine : + Me ( Et ) CONH + NH3 → Me ( Et ) C = NNH2 + H2O + + = = = Oxygen transfer = = = + + + = = = = α @-@ Hydroxylation of enolates = = = = + + α @-@ Hydroxyketones , or acyloins , are important synthetic motifs present in many natural products . α @-@ Hydroxyketones have been synthesized in many ways , including reduction of α @-@ diketones , substitution of a hydroxyl for a leaving group and direct oxidation of an enolate . Oxodiperoxymolybdenum ( pyridine ) - ( hexamethylphosphoric triamide ) ( MoOPH ) and N @-@ sulfonyloxaziridines are the most common electrophilic sources of oxygen implemented in this process . One advantage of using N @-@ sulfonyloxaziridines is that higher chiral induction is almost invariably observed relative to MoOPH and other oxidants . High yield ( 77 @-@ 91 % ) and dr ( 95 : 5 - 99 : 1 ) are reported for α @-@ hydroxylation with the Evans ' chiral auxiliary with N @-@ sulfonyloxaziridine as the electrophile . Chiral induction has been demonstrated with many other chiral ketones and ketones with chiral auxiliaries , including SAMP and RAMP . + Extensive work has been reported on asymmetric hydroxylation of prochiral enolates with camphorsulfonyloxaziridine derivatives , achieving moderate to high enantiomeric excess . The commonly accepted proposed transition state that justifies this stereochemical outcome involves an open transition state where the steric bulk of R1 determines the face of approach . 
+ The selectivity of some hydroxylations may be drastically improved in some cases with the addition of coordinating groups alpha to the oxaziridine ring as in oxaziridines 3b and 3c in the table above . In these instances it is proposed that the reaction proceeds through a closed transition state where the metal oxyanion is stabilized by chelation from the sulfate and coordinating groups on the camphor skeleton . + α @-@ Hydroxylation with oxaziridines has been widely implemented in total synthesis . It is a key step in both the Holton Taxol total synthesis and the Wender Taxol total synthesis . Additionally , Forsyth implemented the transformation in his synthesis of the C3 @-@ C14 ( Substituted 1 @,@ 7 @-@ Dioxaspiro [ 5 @.@ 5 ] undec @-@ 3 @-@ ene ) System of Okadaic acid . + + = = = = Epoxidation of alkenes = = = = + + Epoxidation of alkenes is a common reaction because epoxides can be derivatized in a number of useful ways . Classically , laboratory epoxidation is carried out with mCPBA or other peracids . Oxaziridines have been found to be useful for the formation of highly acid sensitive epoxides . ( − ) -Chaetominine was synthesized via oxaziridine epoxidation as a late stage transformation as seen below . + Another transformation of high synthetic utility is asymmetric epoxidation . A number of asymmetric epoxidations exist : the Sharpless epoxidation , the Jacobsen @-@ Katsuki epoxidation , and the Juliá @-@ Colonna epoxidation . These methods require specific functionality in order to achieve selectivity . The Sharpless epoxidation is specific to allylic alcohols , the Jacobsen epoxidation requires cis @-@ disubstituted aryl alkenes , and the Juliá epoxidation requires α @-@ β unsaturated ketones . Chiral oxaziridines act stereospecifically on many unfunctionalized alkenes . It is even possible to effect stereospecific epoxidation catalytically in the oxaziridine chiral unit . 
Further investigation into these reactions may be required before levels of enantiomeric excess become practical for large scale synthesis . Lusinichi et al. have investigated asymmetric epoxidation with a chiral oxaziridinium salt using oxone as the stoichiometric oxidant , as seen below . + + = = = = Hydroxylation of unactivated hydrocarbons = = = = + + Perfluorinated oxaziridines are known to hydroxylate unactivated hydrocarbons with remarkable regio @-@ and diastereospecificity . This is a highly coveted transformation , and similar reactivity and specificity are seldom rivaled , especially considering the nonmetallic nature of the oxidant . Perfluorinated oxaziridines show high selectivity toward tertiary hydrogens . Hydroxylation of primary carbons and dihydroxylation of a compound with two oxidizable sites have never been observed . Retention of stereochemistry is very high , often 95 - 98 % . ( Retention of stereochemistry may be further enhanced by the addition of a fluoride salt ) . + + = = = Nitrogen transfer = = = + + Oxaziridines with unsubstituted or acylated nitrogens are capable of nitrogen atom transfer , although this reactivity has received considerably less attention . + + = = = = Amination of N @-@ nucleophiles = = = = + + Amination of nucleophiles with N @-@ unsubstituted oxaziridines is quite versatile in the breadth of possible nucleophiles and corresponding products . Hydrazines may be derived from the amination of secondary or tertiary amines , hydroxylamines and thiohydroxamines may be formed from their corresponding alcohols and thiols , sulfimides may be formed from thioethers and α @-@ aminoketones may be formed by attack of corresponding enolates . + + = = = = N @-@ acylamidation = = = = + + The transfer of acylated amines is more difficult than that of unsubstituted amines , although , unlike amine transfer by oxaziridines , there are no alternative methods that directly transfer acylated amines . 
Acylamine transfer has primarily been performed using amines and hydrazines as nucleophiles . Very few transfers of acylated nitrogens to carbon nucleophiles have been successfully performed , although some do exist in the literature . + + = = = Rearrangements = = = + + Oxaziridines have been found to undergo rearrangement reactions via a radical mechanism when irradiated with UV light or in the presence of a single electron transfer reagent such as CuI . Spirocyclic oxaziridines undergo ring expansions to the corresponding lactam . Interestingly , the migrating substituent is determined by a stereoelectronic effect where the group trans to the lone pair on the nitrogen will always be the predominant migration product . In light of this effect , it is possible to take advantage of the chiral nitrogen due to its high inversion barrier to direct the rearrangement . This phenomenon is demonstrated by observed selectivities in the rearrangements below . In the rearrangement on the left the thermodynamically unfavorable product is observed exclusively , while in the reaction on the right the product derived from the less stable radical intermediate is favored . + Aubé takes advantage of this rearrangement as the key step in his synthesis of ( + ) -yohimbine , a natural medicine classified by the NIH as possibly effective in the treatment of erectile dysfunction and the sexual problems caused by selective serotonin reuptake inhibitors . + It is also notable that oxaziridines will thermally rearrange to nitrones . Cis @-@ trans selectivity of the resulting nitrone is poor ; however , yields are good to excellent . It is thought that some oxaziridines racemize over time through a nitrone intermediate . + + = = = Cycloadditions with heterocumulenes = = = + + Oxaziridines undergo cycloaddition reactions with heterocumulenes to afford a number of unique five @-@ membered heterocycles , as depicted in the figure below . 
This reactivity is due to the strained three membered ring and weak N @-@ O bond . + + + = Battle of Dürenstein = + + The Battle of Dürenstein ( also known as the Battle of Dürrenstein , Battle of Dürnstein and Battle of Diernstein ; German : Gefecht bei Dürrenstein ) , on 11 November 1805 was an engagement in the Napoleonic Wars during the War of the Third Coalition . Dürenstein ( modern Dürnstein ) is located in the Wachau Valley , on the River Danube , 73 kilometers ( 45 mi ) upstream from Vienna , Austria . The river makes a crescent @-@ shaped curve between Dürnstein and nearby Krems an der Donau and the battle was fought in the flood plain between the river and the mountains . + At Dürenstein a combined force of Russian and Austrian troops trapped a French division commanded by Théodore Maxime Gazan . The French division was part of the newly created VIII Corps , the so @-@ called Corps Mortier , under command of Édouard Mortier . In pursuing the Austrian retreat from Bavaria , Mortier had over @-@ extended his three divisions along the north bank of the Danube . Mikhail Illarionovich Kutuzov , commander of the Coalition force , enticed Mortier to send Gazan 's division into a trap and French troops were caught in a valley between two Russian columns . They were rescued by the timely arrival of a second division , under command of Pierre Dupont de l 'Étang . The battle extended well into the night . Both sides claimed victory . The French lost more than a third of their participants , and Gazan 's division experienced over 40 percent losses . The Austrians and Russians also had heavy losses--close to 16 percent--but perhaps the most significant was the death in action of Johann Heinrich von Schmitt , one of Austria 's most capable chiefs of staff . + The battle was fought three weeks after the Austrian capitulation at Ulm and three weeks before the Russo @-@ Austrian defeat at the Battle of Austerlitz . After Austerlitz Austria withdrew from the war . 
The French demanded a high indemnity and Francis II abdicated as Holy Roman Emperor , releasing the German states from their allegiance to the Holy Roman Empire . + + = = Background = = + + In a series of conflicts from 1803 @-@ 15 known as the Napoleonic Wars , various European powers formed five coalitions against the First French Empire . Like the wars sparked by the French Revolution ( 1789 ) , these further revolutionized the formation , organization and training of European armies and led to an unprecedented militarization , mainly due to mass conscription . Under the leadership of Napoleon , French power rose quickly as the Grande Armée conquered most of Europe , and collapsed rapidly after the disastrous invasion of Russia in 1812 . Napoleon 's empire ultimately suffered complete military defeat in the 1813 – 14 campaigns , resulting in the restoration of the Bourbon monarchy in France . Although Napoleon made a spectacular return in 1815 , known as the Hundred Days , his defeat at the Battle of Waterloo , the pursuit of his army and himself , his abdication and banishment to the Island of Saint Helena concluded the Napoleonic Wars . + + = = Danube campaign = = + + From 1803 @-@ 06 the Third Coalition fought the First French Empire and its client states ( see table at right ) . Although several naval battles determined control of the seas , the outcome of the war was decided on the continent , predominantly in two major land operations in the Danube valley : the Ulm campaign in the upper Danube and the Vienna campaign , in the middle Danube valley . + Political conflicts in Vienna delayed Austria 's entry into the Third Coalition until 1805 . After hostilities of the War of the Second Coalition ended in 1801 , Archduke Charles--the emperor 's brother--took advantage of the subsequent years of peace to develop a military restructuring plan . 
He carefully put this plan into effect beginning in 1803 – 04 , but implementation was incomplete in 1805 when Karl Mack , Lieutenant Field Marshal and Quartermaster @-@ General of the Army , implemented his own restructuring . Mack bypassed Charles ' methodical approach . Occurring in the field , Mack 's plan also undermined the overall command and organizational structure . Regardless , Mack sent an enthusiastic report to Vienna on the military 's readiness . Furthermore , after misreading Napoleon 's maneuvers in Württemberg , Mack also reported to Vienna on the weakness of French dispositions . His reports convinced the war party advising the emperor , Francis II , to enter the conflict against France , despite Charles ' own advice to the contrary . Responding to the report and rampant anti @-@ French fever in Vienna , Francis dismissed Charles from his post as generalissimo and appointed his Francophobic brother @-@ in @-@ law , Archduke Ferdinand , as commander . + The inexperienced Ferdinand was a poor choice of replacement for the capable Charles , having neither maturity nor aptitude for the assignment . Although Ferdinand retained nominal command , day @-@ to @-@ day decisions were placed in the hands of Mack , equally ill @-@ suited for such an important assignment . When Mack was wounded early in the campaign , he was unable to take full charge of the army . Consequently , command further devolved to Lieutenant Field Marshal Karl Philipp , Prince of Schwarzenberg , an able cavalry officer but inexperienced in the command of such a large army . + + = = = Road to Ulm = = = + + The campaign in the upper Danube valley began in October , with several clashes in Swabia . Near the Bavarian town of Wertingen , 40 kilometers ( 25 mi ) northwest of Augsburg , on 8 October the 1st Regiment of dragoons , part of Murat 's Reserve Cavalry Corps , and grenadiers of Lannes ' V Corps surprised an Austrian force half its size . 
The Austrians were arrayed in a line and unable to form their defensive squares quickly enough to protect themselves from the 4 @,@ 000 dragoons and 8 @,@ 000 grenadiers . Nearly 3 @,@ 000 Austrians were captured and over 400 were killed or wounded . A day later , at another small town , Günzburg--immediately south of the Danube River--the French 59th Regiment of the Line stormed a bridge over the Danube and , humiliatingly , chased two large Austrian columns toward Ulm . + The campaign was not entirely bad news for Vienna . At Haslach , Johann von Klenau arranged his 25 @,@ 000 infantry and cavalry in a prime defensive position and , on 11 October , the overly confident General of Division Pierre Dupont de l 'Étang attacked Klenau 's force with fewer than 8 @,@ 000 men . The French lost 1 @,@ 500 men killed and wounded . Aside from taking the Imperial Eagles and guidons of the 15th and 17th Dragoons , Klenau 's force also captured 900 men , 11 guns and 18 ammunition wagons . + Klenau 's victory was a singular success . On 14 October Mack sent two columns out of Ulm in preparation for a breakout to the north : one under Johann Sigismund Riesch headed toward Elchingen to secure the bridge there , and the other under Franz von Werneck went north with most of the heavy artillery . Recognizing the opportunity , Marshal Michel Ney hurried the rest of his VI Corps forward to re @-@ establish contact with Dupont , who was still north of the Danube . In a two @-@ pronged attack Ney sent one division to the south of Elchingen on the right bank of the Danube . This division began the assault at Elchingen . At the same time another division crossed the river to the east and moved west against Riesch 's position . After clearing Austrian pickets from a bridge , the French attacked and captured a strategically located abbey at the top of the hill at bayonet point . The Austrian cavalry unsuccessfully tried to fend off the French , but the Austrian infantry broke and ran . 
In this engagement alone , the Austrians lost more than half their reserve artillery park , 6 @,@ 000 ( out of 8 @,@ 000 total participants ) dead , wounded or captured and four colors . Riesch 's column also failed to destroy the bridges across the Danube . + Napoleon 's lightning campaign exposed the indecisive Austrian command structure and poor supply apparatus . Mack completely misread the French dispositions and scattered his forces ; as the French defeated each unit separately , the surviving Austrians withdrew toward the Ulm fortifications . Napoleon arrived to take personal command of close to 80 @,@ 000 men . At Ulm on 16 October Karl Mack surrendered his encircled army of 20 @,@ 000 infantry and 3 @,@ 273 cavalry . The officers were released on the condition that they not serve against France until formally exchanged for French officers captured by the Austrians , an agreement to which they held . + + = = Prelude to battle = = + + The few Austrian corps not trapped at Ulm withdrew toward Vienna , with the French in close pursuit . A Russian army under Gen. Mikhail Kutuzov also maneuvered away from the French , withdrawing to the east . At the Ill river on 22 October it joined with the retreating Austrian corps commanded by Michael von Kienmayer . On 5 November the Coalition forces held a successful rearguard action in Amstetten . On 7 November the Russians arrived in St. Pölten and crossed the Danube river the next day . Late on 9 November they destroyed the bridges across the Danube , holding the last one at the hamlet of Stein , near the village of Krems , until the late afternoon . + + = = = Battlefield = = = + + To the east of Stein , 2 kilometers ( 1 @.@ 2 mi ) down an old road , lay Krems , with its small population of a few hundred , at the confluence of the stream of that name and the Danube . To the west of Stein the Danube made a large curve , creating a crescent @-@ shaped floodplain between it and the mountains . 
At the far western end of the floodplain , where the mountains came down almost to the river 's edge , was Dürenstein with its castle , known as Schloss Dürenstein . The castle had served as a prison for Richard I of England in 1193 . In 1645 – 46 , during the Thirty Years War , the Swedes had fortified the castle and then demolished it when they withdrew . It stands at 159 meters ( 522 ft ) , on the highest ridge of a mountain fissured with clefts and pinnacles of granite . Because the mountain was sparsely vegetated , it was difficult to distinguish the ruins from the rocks . Narrow canyons cut through the mountain , and widen into the plain below . Between Dürenstein and Stein , on the flood plain , lay the hamlets of Oberloiben and Unterloiben . Near the hamlets , the Loiben flood plain was at its widest , extending at the most 762 meters ( 2 @,@ 500 ft ) from the base of the Loibenberg mountain to the bank of the river . + The region was known for its wine . Since the 15th century the local inhabitants practiced viticulture and the wine producers formed St. Paul Vintners ' Guild in 1447 , the oldest such guild in the German @-@ speaking world . Terraced vineyards extended up the sides of the Krems River until it became a mountain stream and terrain was unsuitable for cultivation . The Loiben plain supported both viticulture and agriculture . As the terrain became steeper , the vines grew in terraces built from the dark Urgestein , primordial rock . From Dürenstein to Krems the river makes its wide curve ; the mountains and the steeply terraced slopes prevent clear line @-@ of @-@ sight between the two towns . + + = = = Dispositions = = = + + Napoleon had calculated that Kutuzov would withdraw toward Vienna , expecting reinforcements from Russia ; he envisioned that the armies would engage in a great battle at Vienna , and that this battle would decide the war . 
Consequently , Napoleon drew divisions from four of the other seven corps of the Grande Armée to create a new VIII Corps . This corps was to secure the north shore of the Danube , block any of the Austrian or Russian groups from reinforcing one another and , more importantly , prevent Kutuzov from crossing the river and escaping to Russia . + The new VIII Corps , under the overall command of Édouard Mortier , included three infantry divisions and a division of cavalry ( see Order of Battle below ) . Corps Mortier , as it was known , crossed the Danube at Linz and Passau in early November 1805 and marched east , on the north bank of the Danube . Operating independently , the corps ' cavalry conducted reconnaissance ahead of them and on the flanks . Gen. Gazan 's division ( about 6 @,@ 000 men ) took the lead ; Mortier was with them . They were followed by Gen. Dupont 's division ( another 4 @,@ 000 ) about one day 's march behind . Jean @-@ Baptiste Dumonceau 's division ( another 4 @,@ 000 ) , marching another day behind Dupont , brought up the rear . A flotilla of 50 boats acquired at Passau provided communications across the Danube . Before sending Mortier on his mission , Napoleon instructed him to protect his north flank at all times against possible Russian reinforcements , advice he reiterated in subsequent written orders . Napoleon also advised Mortier to secure all crossings of the Danube between Linz and Vienna . + On 9 November Gazan 's division reached Marbach an der Donau and covered the 50 kilometers ( 31 mi ) to Dürenstein by early the following afternoon . Here it skirmished with some Russian patrols to the east of the town and expelled them . Feeling confident , the French established a forward post just upstream from Stein . In Dürenstein itself Mortier set up his command post and directed the establishment of a small field hospital . 
Although the position seemed secure , he had ignored Napoleon 's strict instructions and neglected to protect his left ( north ) flank . + This failure was an important factor when Mortier lost his corps ' so @-@ called " eyes " : after he and Gazan had crossed the Danube , the French dragoons had veered to the northwest , leaving only three squadrons of the 4th Dragoons available for reconnaissance . These had left the division and were operating independently of Gazan 's command . Consequently , Mortier and Gazan marched blindly through the narrow canyon west of Dürenstein , not knowing what lay ahead of them . Kutuzov had led the Coalition army across the Danube at Krems , a short distance past Stein , and destroyed the bridge behind him . His actions deprived the French commanders of a possible route across the Danube , putting the deployment of the entire French division at further risk in the case of retreat . In this decision Kutuzov abandoned Vienna to the French , who were converging on the Austrian capital from the north , west and southwest , for the security of uniting with reinforcements from Galicia . Kutuzov chose a military solution over a political one . + Unknown to either Gazan or Mortier , the Coalition had concentrated a force of approximately 24 @,@ 000 men ( mostly Russians and a few Austrians ) within a few kilometers of the French position at Dürenstein . In comparison , Gazan 's division had only 6 @,@ 000 men . The Austro @-@ Russian force was a mixture of infantry , Jägers ( usually deployed as skirmishers ) , Russian musketeers and Russian and Austrian cavalry , accompanied by more than 68 artillery pieces . Kutuzov , who had learned the military arts under the tutelage of the legendary Russian Generalissimo Suvorov , had overall command . The Russian cavalry , units of the greatly feared Cossacks , were well @-@ suited for patrolling the river bank ; indeed , on 9 November they had taken 40 French soldiers as prisoners . 
Furthermore , reinforcements stood in Moravia , less than two weeks ' march away . If the main body of the French army crossed the river , they would require time to prepare . Kutuzov would have ample warning of any large @-@ scale French movement . + After the afternoon 's initial skirmishing with the French , Kutuzov held a council of war on the evening of 10 November at Melk , at the great abbey there . He knew several things . First , he knew the positions of the French from prisoners his Cossacks had captured . He also knew that Gazan had crossed at Linz and was well ahead of any French reinforcements : Dupont had crossed at Passau and , by 10 November , stood at Marbach , 50 kilometers ( 31 mi ) upstream , and Dumonceau was another 7 kilometers ( 4 mi ) further behind him . Kutuzov knew the size of the French force--its division strength--and its positions , and he knew that most of the dragoons were not covering the French flank but had turned north . He also knew , or had made a good supposition , about Napoleon 's orders , so he knew what to offer Mortier and Gazan as bait . + + = = = Battle plan = = = + + In addition to the Russian generals , the council included Austrian commanders Lieutenant Field Marshal Johann Heinrich von Schmitt and Friedrich Karl Wilhelm , Fürst zu Hohenlohe . Schmitt , who had retired from the military in 1800 , had been recalled into service after the Ulm debacle and had come to Kutuzov highly recommended by the Emperor . He was an experienced tactician and strategist and had served in a variety of posts in the Habsburg military ; he had been Archduke Charles ' trusted adviser during the campaigns from 1796 to 1800 and had assisted in planning several of Charles ' victories . Upon his recall , Schmitt was appointed Chief of the Quartermaster General Staff of the Coalition Army . The generals had found among the Austrian force one Capt. Christoph Freiherr von Stiebar ( 1753 – 1824 ) , who had knowledge of the local geography . 
+ Together , Schmitt , Kutuzov and the other generals , with von Stiebar 's advice on the local terrain , concocted a plan to encircle the French at Dürenstein . Russian commander Mikhail Andreyevich Miloradovich would approach Gazan 's division from the east , supported by Petr Bagration 's corps , and pin the French in place . Three additional columns , commanded by Dmitry Dokhturov ( Doctorov ) , Maj. Gen. Strik and Schmitt , would outflank the French from the west and the north . They would offer , as bait , a rumor : the Russian army was retreating into Moravia and only a rearguard would be left at Krems . + + = = Battle = = + + On the night of 10 – 11 November a Russian column under Strik 's command began its passage through the narrow canyons , intent on arriving at Dürenstein by noon ; two more columns , under Dokhtorov and Schmitt , moved in wider semicircles , planning to pass through the mountains and attack the French , who were extended along the river bank . According to the plan , in late morning Strik 's column would emerge from the mountains first and launch a flanking assault on the French right . This flanking attack , combined with Miloradovich 's frontal assault from Stein , would force the French into a vise ; encircled , they would have no option but to surrender--or die . To ensure the success of the plan , the second and third columns , under Dokhtorov and Schmitt , would arrive in early and mid @-@ afternoon and support the earlier assaults . In this way , even if the French tried to retreat west to Marbach , they would not escape the vise @-@ like grip of the Coalition army . + Mortier accepted the bait of a rumored Russian retreat . In the early morning of 11 November he and Gazan departed from Dürenstein to seize Stein and Krems , presuming the Russians had either abandoned the settlements or left only a small rear @-@ guard behind . As they approached Stein , a column of Miloradovich 's troops attacked the French forward positions . 
Thinking this force was the rumored Russian rear guard , Mortier ordered Gen. Gazan to counterattack and push east towards the town of Stein . Fighting spread through the villages of Oberloiben , Unterloiben and the farm at Rothenhof . Instead of withdrawing , as a rear guard would , more and more Russian troops appeared and engaged the French column . + Initially Gazan made rapid progress , but he quickly recognized that the opposing force was much stronger than the typical rear guard of a retreating army . Realizing he had been duped and that Gazan 's troops were tiring rapidly , Mortier sent orders to Dupont 's division to hurry forward . By mid @-@ morning the French momentum had stalled ; Mortier committed most of his remaining forces to driving Miloradovich back , leaving a single battalion--perhaps 300 troops--to cover his northern flank , and sent the rest to attack the Russian right . Within 30 minutes he achieved the superiority of numbers he sought . His 4 @,@ 500 French opposed 2 @,@ 600 Russians and forced them back toward Stein while pressing an attack along the river . Miloradovich had no option , for neither Strik 's nor Dokhturov 's flanking columns were to be seen . + At this stage of the battle fighting paused . Mortier and Gazan waited for Dupont 's arrival while Kutuzov and Miloradovich waited for Strik 's and Dokhturov 's . Schmitt 's column was expected to be the last to join the fight because it had to march the greatest distance . The timing of the respite varies , depending on whose reports are consulted : fighting paused at around 12 : 00 or 14 : 00 . Strik arrived first and immediately assaulted Gazan 's line with three battalions , pushing the French out of Dürenstein . Caught between two strong forces , Gazan attempted to push his way back through Dürenstein , to reach the river where the flotilla could evacuate his exhausted troops . 
Withdrawing through the narrow Danube canyon and fighting off the Russian force at their rear , Gazan and his division were trapped when more of Strik 's Russians appeared to block their retreat . The narrow defiles hampered the Russians ; Strik 's men had to march out of the canyons , form ranks and attack in waves . Despite Strik 's continuous assault in the next two to three hours , Mortier and Gazan pushed the Russians back up the narrow fissure in the hillside . At this point , Dokhturov 's column appeared behind the French line and joined the battle . The French were outnumbered more than three to one , assaulted in the front by Miloradovich 's column , in the middle by Strik 's and in the rear by Dokhturov . + Earlier in the morning Dupont had proceeded with his column south and east along the river , from Marbach , according to instructions . Even before the arrival of Mortier 's courier , he heard the sound of artillery in the distance and sent riders ahead to discover the cause . They came back to report that a Russian column ( Dokhturov 's ) was descending from the mountains to take the road to Dürenstein . Realizing this would separate him from the forward division , Dupont hustled his troops toward the sound of battle and deployed them to take the Russians in the flank . The French assault , heralded by cannon fire , caused Dokhturov 's troops to turn their attention from Gazan 's beleaguered force to face these new assailants . Although superior in numbers , Dokhturov 's column had no supporting artillery , and the narrow space prevented them from taking advantage of their size . It was Dokhturov 's turn to face attackers at his front and rear , until the arrival of Schmitt 's column , which had wended its way through the mountains in the west . + Schmitt arrived at dusk , and the action continued well after dark ; in mid @-@ November night falls at close to 17 : 00 in the upper Danube climes . 
Despite the darkness , Schmitt descended out of the defiles and deployed his troops to assail Dupont 's flank . As his Russians entered the fray , they came between a battalion of French and another of Russians . With the additional force , the French were overwhelmed , but most of the shooting subsided when the combatants could not tell apart friend from foe in the dark . Under the cover of darkness , Mortier used the French flotilla to evacuate his exhausted troops to the south bank . The French and Russians continued to skirmish fitfully into the night as sentries encountered one another in the dark . Portions of Gazan 's force provided any necessary rear guard action , and the following morning the remaining men were evacuated from the north shore of the Danube , while they maintained possession of only Spitz and Weissenkirchen on the north bank . + + = = = Losses = = = + + The losses were staggering : Gazan lost close to 40 percent of his division to death and wounds . Aside from losing five guns , 47 officers and 895 men under his command were captured , bringing the loss of effectives closer to 60 percent ; furthermore , he lost the eagles of the 4th Infantry Regiment ( France ) and the eagle and guidon of the 4th Dragoons . The Russians lost around 4 @,@ 000 , about 16 percent of their force , and two regimental colors . The Austrian Lieutenant Field Marshal Schmitt was killed as the battle concluded , probably by Russian musketry in the confused melee . The vineyards and the villages of Ober- and Unterloiben were destroyed , as was most of Dürenstein and Stein . Krems was heavily damaged ; the French plundered the town at least twice , and " barbarously handled " its inhabitants . + + = = Aftermath = = + + Both sides claimed victory . 
Although losses were fairly equal in terms of numbers--4,000 wounded or dead on each side--the Coalition forces went into battle with 24 @,@ 000 men while the French started with Gazan 's division of about 6 @,@ 000 , which grew close to 8 @,@ 000 when Dupont 's men joined the fighting in the afternoon . Regardless , Gazan 's division was nearly destroyed ; the 30 percent losses experienced by the French fell predominantly on his division . Clearly for both sides , the fighting was hard . The weather had been cold ; an early storm had left slick icy mud in the roadways , and icicles " like chandeliers " hung from the trees . + For the Coalition , the Russians were secure on the north bank of the Danube , awaiting reinforcements from Galicia ; the bridges between Linz and Vienna had been destroyed , making French access to the Austrian capital more difficult , but not impossible . After six months of fighting in which the Austrians had enjoyed little good news , the Coalition could claim a difficult and timely victory . The French had retreated from the field with a badly mauled division and Kutuzov had secured the right flank . Indeed , Francis was so pleased with the outcome at Dürenstein that he awarded Kutuzov the Military Order of Maria Theresa . + For the French , the survival of the Corps Mortier seemed nothing short of a miracle . The remainder of Gazan 's division crossed the river the next morning and eventually recuperated in Vienna , which the French acquired by deception later in the month . More importantly for them , the French force had performed well over difficult terrain and under terrible combat conditions . Initially there had been some panic and parts of at least one French battalion had tried to escape on the flotilla craft . They had lost control of the boats in the current and smashed into the pillars of the burned bridge at Krems , overturning their boats . Tossed into the icy river , most had drowned . 
Despite this initial panic , Gazan 's column retained its cohesion , and responded well to various difficult demands . Dupont had demonstrated his tactical acumen : when he heard cannon fire , he directed his troops toward it to support the French division . In terms of French staffing , Mortier 's failure to guard his flank , especially in the face of Napoleon 's direct advice , adversely influenced his relationship with his commander . However , in the immediate weeks ahead , the flamboyant Murat did more to annoy Napoleon than Mortier had . In assessing the battle and its aftermath , historians have laid the blame and credit for its outcome not only on Mortier and Gazan : " Napoleon , aware of Mortier 's danger and his own culpability for it , vented his frustration on Murat , whom he unjustly accused of abandoning Mortier for the empty glory of riding through Vienna . " + After the victory at Austerlitz , Napoleon dispersed the VIII Corps and reassigned Mortier . However disappointed he may have been with Mortier , Napoleon was pleased with Gazan 's performance . As recognition of his conduct in what the French called " the immortal Battle of Dürenstein " , Gazan received the Officer 's Grand Cross of the Legion of Honor . + The loss of Schmitt was a significant blow to the Austrian military organization . Called out of retirement for this specific task , he was one of their most experienced general staff officers , other than the Archduke Charles . From the summer of 1796 until his retirement in 1800 he had been Chief of the Quartermaster General Staff of the Army , the Lower Rhine , the Rhine and the Army of Germany . Furthermore , he was a trusted member of Archduke Charles ' staff . He had helped to design several of Charles ' more important victories at Emmendingen , Schliengen , the sieges at Kehl and Hünigen , the battles at Ostrach and Stockach , and the northern Swiss Campaign of 1799 that included battles at Winterthur and Zürich . 
An experienced officer and excellent tactician , he might well have made a more effective Chief of the Quartermaster General Staff of the Coalition Army at the Battle of Austerlitz than his eventual replacement , Franz von Weyrother . In Schmitt 's absence Weyrother , the architect of the Austrian catastrophe at Hohenlinden in 1800 , was chosen to develop the general battle plan of Coalition action at Austerlitz . Schmitt , undoubtedly a far better tactician than Weyrother , and possessed of superior training and mapping skills , would have developed a more realistic Coalition plan for Austerlitz . Schmitt 's presence would probably not have been enough to turn that defeat into a victory , but it would have mitigated the magnitude of the Coalition 's losses ; Austerlitz was considered one of Napoleon 's finest triumphs . + In the broader picture , despite the important major naval engagements , the outcome of the War of the Third Coalition was determined on the Continent , predominantly in the two major land operations . In the Ulm campaign , the Habsburgs achieved some minor victories , such as Klenau 's at Haslach @-@ Jungingen , but ultimately lost an entire army and an officer corps . The latter would not resume arms against France until formally exchanged . This condition crippled the Austrian military leadership and forced the recall of such pensioners as Schmitt out of retirement . After the capitulation at Ulm , isolated portions of the Austrian military evaded capture and joined with their Russian allies ; Michael von Kienmayer 's corps slipped out of the encirclement and joined Kutuzov 's force . A few other small forces refused to capitulate and seemingly melted into the Bavarian mountains and the Thuringian forests , to reappear in Bohemia for Austerlitz . Sixteen hundred cavalry , including Archduke Ferdinand and Prince Schwarzenberg , broke out of Ulm before its capitulation . 
Maximilian , Count of Merveldt , led his column back through the mountains into Austria , fighting rear guard actions against pursuing French forces at the Steyer ( Steyr ) and Mariazell . These elusive units were insufficient to balance heavy losses at key battles in which the Austrians could not hold their own against the French . Between the Ulm capitulation and the Austrian and Russian defeat at Austerlitz , there were other minor achievements : a successful skirmish between the cavalry that escaped from Ulm and the French near the town of Nördlingen , the contested victory at Dürenstein , and another within days at Schöngrabern . + The second determining event , the decisive French victory at the Battle of Austerlitz over the combined Russian and Austrian armies , forced the Austrian withdrawal from the Coalition . The subsequent Peace of Pressburg , signed on 26 December 1805 , reinforced the earlier treaties of Campo Formio and Lunéville . Furthermore , Austria ceded land to Napoleon 's German allies , and paid an indemnity of 40 million francs . Victory at Austerlitz also gave Napoleon the latitude to create a buffer zone of German states between France and the states of Prussia , Russia , and Austria . These measures did not establish a lasting peace on the continent . Prussian worries about growing French influence in Central Europe sparked the War of the Fourth Coalition in 1806 , in which Austria did not participate . + + = = Battlefield commemorations = = + + Until 1805 , Dürenstein was probably best known as the village in which crusader Richard the Lionheart was held by Leopold V , Duke of Austria . In 1741 , during the War of the Austrian Succession , several hundred local villagers had held off the French and Bavarian armies , intent on capturing Vienna , by painting drain pipes to look like cannons , and beating on drums , thus suggesting the presence of a large force . 
+ After 1805 , the exploits of 40 @,@ 000 French , Russian , and Austrian soldiers excited the European imagination . General Schmitt 's grave has never been found , but in 1811 a monument for him was erected at the Stein Tor , the gate leading from the old village of Krems to the hamlet of Stein . The house in which Captain von Stiebar lived was marked with a bronze plate commemorating his contribution to the battle . In 1840 , a Spanish lithographer created an image of the battle , which was later expanded by English lithographer John Outhwaite . The image depicts the evacuation of French troops via the Danube flotilla ( see Infobox image ) on a moonlit night . In fact , the moon was in its last quarter phase 48 hours later , and on 11 November probably did not provide as much light as depicted in the image . + In 1836 , Jean Antoine Siméon Fort ( French , 1793 – 1861 ) , a historical painter , created a watercolor of the battle , Combat de Dürnstein le 11 novembre 1805 ( ( English ) Battle of Dürenstein of 11 November 1805 ) , which is in the Trianon collection at Versailles . + In the Russian novel War and Peace , Leo Tolstoy devoted several pages to the battle , its prelude , and its aftermath , and the delivery of its news to the Tsar by Prince Andrew . Between Dürenstein and Rossatz , at the edge of the Loiben plain , stands the " Little Frenchman " memorial ( see image ) erected in 1905 to commemorate the battle ; it bears the names of Mortier , Gazan , Kutuzov , Schmitt , and others on a copper @-@ engraved plate . + + = = Orders of battle = = + + + = = = French VIII . Corps ( Corps Mortier ) = = = + + On 6 November , Édouard Adolphe Mortier commanded the following forces : + 1st Division under command of Pierre Dupont de l 'Étang ( formerly 1st Division of VI . Corps ) , six battalions , three squadrons , and three guns , most of which were involved in the fighting after mid @-@ day . 
+ 2nd Division under command of Honoré Théodore Maxime Gazan de la Peyrière ( formerly 2nd Division of the V. Corps ) , nine battalions , three squadrons , three guns . + 3rd Division under command of Jean @-@ Baptiste Dumonceau ( Batavian Division , formerly 3rd Division of the II . Corps ) . The 3rd Division was not involved in the fighting . + Dragoon Division under command of Louis Klein . Klein 's division included the 1st , 2nd , 4th , and 14th Regiments of Dragoons . They were not involved in the fighting . + Danube fleet of fifty boats , under the command of Frigate Captain Lostange . + Total : fifteen battalions , six squadrons , six guns , approximately 12 @,@ 000 men , not all of which were involved in the fighting . + + = = = Coalition columns = = = + + First Column , commanded by General of Brigade Prince Pyotr Ivanovich Bagration , included three battalions of infantry , three grenadier battalions , and three Jäger battalions , ten squadrons of Hussars . + Second Column , Lieutenant General Essen , included six battalions of infantry , three battalions of grenadiers , and five squadrons of Hussars . + Third Column , commanded by Lieutenant General Dokhturov , including six battalions of infantry , one battalion from the 8th Jäger regiment , and ten squadrons of the Hussar Regiment Mariupol . + Fourth Column , commanded by Lieutenant General Schepelev , nine battalions of infantry . + Fifth Column , Lieutenant General Freiherr von Maltitz , nine battalions of infantry . + Sixth Column , Lieutenant General Freiherr von Rosen , with six battalions of Infantry and ten squadrons of cavalry . The Sixth Column did not take part in the fighting . + Austrian Infantry Brigade , Major General Johann Nepomuk von Nostitz @-@ Rieneck , four battalions of Border Infantry , including the highly decorated 9th Regiment Peterwardeiner . 
+ Austrian Cavalry Division , Lieutenant Field Marshal Friedrich Karl Wilhelm , Fürst zu Hohenlohe , twenty @-@ two squadrons of cavalry . + Total : fifty @-@ eight battalions , sixty @-@ two squadrons , fourteen artillery batteries , approximately 24 @,@ 000 men and 168 guns . + + + = Brock Lesnar = + + Brock Edward Lesnar / ˈlɛznər / ( born July 12 , 1977 ) is an American Canadian professional wrestler , mixed martial artist , and former amateur wrestler and professional American football player . He is currently signed to WWE on the Raw brand . He is a four @-@ time WWE ( World Heavyweight ) Champion , a former UFC Heavyweight Champion , and an NCAA Division I Heavyweight Wrestling Champion . He is also a one @-@ time IWGP Heavyweight Champion , making him a five @-@ time world champion in professional wrestling . As of July 14 , 2016 , he is # 8 in official UFC heavyweight rankings . + After his successful amateur wrestling career at Bismarck State College and the University of Minnesota ( 106 wins and 5 losses ) , Lesnar signed with WWE ( then the World Wrestling Federation ) in 2000 . He was assigned to its developmental promotion Ohio Valley Wrestling ( OVW ) , where he was a three @-@ time OVW Southern Tag Team Champion with Shelton Benjamin . After debuting on WWE 's main roster in 2002 , he won the WWE Championship on three separate occasions with victories over The Rock and Kurt Angle ( twice ) . Lesnar won his first WWE Undisputed Championship five months after his main roster debut at the age of 25 , becoming the youngest champion in the title 's history . He was also the 2002 King of the Ring and the 2003 Royal Rumble winner , making him the youngest King of the Ring and Royal Rumble winner as well . Following his match with Goldberg at WrestleMania XX , Lesnar left the WWE and pursued a career in the National Football League ( NFL ) . He was named a defensive tackle for the Minnesota Vikings but was cut prior to the start of the 2004 – 05 season . 
In 2005 , Lesnar returned to professional wrestling and signed with New Japan Pro Wrestling ( NJPW ) , where he won the IWGP Heavyweight Championship in his first match . After a contractual dispute with NJPW , he also wrestled as IWGP Heavyweight Champion in the Inoki Genome Federation ( IGF ) . + In 2006 , Lesnar pursued a career in mixed martial arts . He signed with Hero 's and won his first fight , against Min @-@ Soo Kim , in June 2007 . He then signed with the Ultimate Fighting Championship ( UFC ) the following October . Lesnar lost in his UFC debut against Frank Mir and then won his second fight against Heath Herring . In November 2008 , Lesnar defeated Randy Couture to become the UFC Heavyweight Champion . Shortly after a successful title defense in a rematch with Mir , Lesnar was sidelined due to diverticulitis . He would return at UFC 116 to defeat Interim UFC Heavyweight Champion Shane Carwin and unify the heavyweight championships , becoming the Undisputed Heavyweight Champion . Lesnar then lost the championship to Cain Velasquez at UFC 121 . In 2011 , he was once again sidelined due to diverticulitis and underwent surgery . Lesnar returned at UFC 141 in December , losing to Alistair Overeem and promptly retiring from MMA . Lesnar was a box office sensation in UFC . He took part in a few of the best selling pay @-@ per @-@ views in UFC history , including UFC 100 and UFC 200 . + In April 2012 , Lesnar once again returned to professional wrestling , rejoining WWE after an eight @-@ year hiatus . Two years later , at WrestleMania XXX , Lesnar defeated The Undertaker to end his undefeated streak at the premier annual event . Lesnar has been managed by Paul Heyman throughout the majority of his professional wrestling career . He has headlined numerous pay @-@ per @-@ view events for both the WWE and UFC , including WrestleMania XIX , WrestleMania 31 , UFC 100 , and UFC 116 . 
In 2015 , an ESPN.com article referred to Lesnar as " the most accomplished athlete in pro wrestling history " . + + = = Early life = = + + Lesnar was born in Webster , South Dakota , on July 12 , 1977 . He was raised on a Webster dairy farm owned by his parents , Stephanie and Richard Lesnar . He is of German descent . He has two older brothers named Troy and Chad , and a younger sister named Brandi . At age 17 , he joined the National Guard , where he was assigned to an office job after his red @-@ green colorblindness was deemed hazardous to his desire to work with explosives . He lost this job after failing a computer typing test , and later worked for a construction company . + Lesnar attended Webster High School , where he played football and competed in amateur wrestling , placing third in the state championships his senior year . He then attended Bismarck State College , where he won the National Junior College Athletic Association ( NJCAA ) heavyweight wrestling championship in his sophomore year . He transferred to the University of Minnesota on a wrestling scholarship for his junior and senior college years . There , he was roommates with future WWE colleague Shelton Benjamin , who was also his assistant coach . + Lesnar won the 2000 National Collegiate Athletic Association ( NCAA ) Division I heavyweight wrestling championship his senior year after being the runner @-@ up to Stephen Neal the year prior . He finished his amateur career as a two @-@ time NJCAA All @-@ American , the 1998 NJCAA Heavyweight Champion , two @-@ time NCAA All @-@ American , two @-@ time Big Ten Conference Champion , and the 2000 NCAA Heavyweight Champion , with a record of 106 – 5 overall in four years of college . + + = = Professional wrestling career = = + + + = = = World Wrestling Federation / Entertainment = = = + + + = = = = Training and debut ( 2000 – 2002 ) = = = = + + In 2000 , Lesnar signed with the World Wrestling Federation ( WWF ) . 
He was sent to its developmental territory , Ohio Valley Wrestling . There , Lesnar first met future friend and manager Paul Heyman . He formed a tag team known as " The Minnesota Stretching Crew " with his former college roommate , Shelton Benjamin . Lesnar and Benjamin won the OVW Southern Tag Team Championship on three separate occasions . Lesnar wrestled several dark matches in 2001 and 2002 before being called up to the main roster . + Lesnar debuted on WWF television on the March 18 , 2002 , episode of Raw , coming through the crowd and attacking Al Snow , Maven and Spike Dudley during their match . He was accompanied by Paul Heyman , who was seen giving instructions to Lesnar . When the brand extension was introduced in the WWF , Lesnar was drafted to the Raw brand . Later , Heyman was confirmed to be Lesnar 's agent and gave Lesnar the nickname " The Next Big Thing " . Lesnar 's first feud was with the Hardy Boyz . Lesnar and Jeff Hardy squared off at Backlash in Lesnar 's first official televised match . He won the match by knockout after Hardy was unable to respond to referee Theodore Long . The next night on Raw , Lesnar faced off against Jeff Hardy 's brother , Matt Hardy , and defeated him in the same fashion . + + = = = = WWE Championship reigns ( 2002 – 2004 ) = = = = + + In June 2002 , Lesnar won the King of the Ring tournament , defeating Bubba Ray Dudley in the first round , Booker T in the quarter @-@ finals , Test in the semi @-@ finals , and Rob Van Dam in the finals , earning him a shot at the WWE Undisputed Championship at SummerSlam . On July 22 , Lesnar joined the SmackDown ! brand . After a quick feud with Hollywood Hulk Hogan in August 2002 , Lesnar defeated WWE Undisputed Champion , The Rock at SummerSlam to become the WWE Undisputed Champion and youngest WWE Champion at age 25 , a record previously held by The Rock . 
He also became the second fastest wrestler to win the WWE Championship since his debut ( 126 days ) behind only Ric Flair ( 113 days ) . At the time , the Undisputed WWE Championship was being defended on both brands , so Raw General Manager Eric Bischoff expected Lesnar to return to Raw the following night . However , SmackDown ! General Manager Stephanie McMahon announced that Lesnar was only required to defend the title on SmackDown ! , forcing Bischoff to institute a new championship for Raw ( the World Heavyweight Championship ) . The WWE Undisputed Championship was then renamed the WWE Championship . + Lesnar 's rapid rise to the top of WWE in 2002 led to a feud with The Undertaker , which involved a match at Unforgiven . The match ended in a double disqualification resulting in Lesnar retaining the title . Lesnar faced The Undertaker again , at No Mercy , this time in a Hell in a Cell match . Leading up to the match , in the storyline , Lesnar broke the Undertaker 's hand with a propane tank . Despite Heyman begging McMahon not to let The Undertaker use his cast as a weapon , the request was denied and the match went on as planned . In a match that saw both wrestlers and even Heyman covered in blood , it ended when Lesnar reversed an attempted Tombstone Piledriver into his finishing F @-@ 5 maneuver for the win . Six days after his Hell in a Cell match with The Undertaker , Lesnar successfully retained his WWE title in a handicap match with Heyman at the Rebellion pay @-@ per @-@ view against Edge . + Lesnar 's next opponent was Big Show . Heyman was convinced more than anyone that Lesnar could not win , trying to talk him out of defending the title . Lesnar refused and faced Big Show in Madison Square Garden at the Survivor Series pay @-@ per @-@ view . Towards the end of the match , Lesnar delivered an F @-@ 5 to Big Show , but when he went for the pin , Heyman pulled the referee out of the ring . 
This allowed Big Show to capitalize and chokeslam Lesnar on a steel chair . Show went on to pin Lesnar and win the title . This loss was Lesnar 's first pinfall loss in WWE and led Lesnar to turn into a fan favorite for the first time in his career . Following Survivor Series , Heyman made it clear that Lesnar would not get a rematch , and had snuck a special clause saying so into his contract . In order to gain his revenge on Big Show and Heyman , Lesnar interfered in Big Show 's first title defense , which came against Kurt Angle the next month at Armageddon . Lesnar hit the F @-@ 5 on the champion , which enabled Angle to pin him and win the title . On the following episode of SmackDown ! , however , Angle introduced Heyman as his manager and , despite promising Lesnar a title shot earlier in the evening , declared that Lesnar still would not get it . Lesnar was beaten down by Big Show and Angle after the main event , but would get his revenge after the show went off the air . He eventually knocked the Big Show out with a steel chair , leaving Angle alone with Lesnar . Lesnar then chased the champion out of the ring and resumed his assault that culminated when Lesnar used the F @-@ 5 to propel Angle 's right knee into the steel ringpost . As paramedics tended to a screaming Angle , Lesnar finished off the assault with a kneebreaker on the ringside barricade , breaking the champion 's leg in storyline . + With Angle temporarily put out of action , Lesnar 's rivalry with Heyman and the Big Show resumed , which culminated in a match at the Royal Rumble in January 2003 with the winner being placed into the Royal Rumble later in the evening . Lesnar would defeat Big Show and entered the Royal Rumble as the # 29 entry , the second to last competitor to enter the match . He eliminated Matt Hardy and The World 's Greatest Tag Team , which was composed of Charlie Haas and his former OVW teammate , Shelton Benjamin , who were mentored by Angle . 
Lesnar would then eliminate The Undertaker last and win the Royal Rumble , which guaranteed him a title match at WrestleMania XIX . + After the Royal Rumble , Lesnar and Chris Benoit faced off against Angle , Haas , and Benjamin at No Way Out the following month and Lesnar 's team won the match . During the match at WrestleMania , Lesnar botched a shooting star press , a move he 'd done numerous times in developmental matches , and jammed his head and neck . This stunned Lesnar and forced Angle and Lesnar to improvise the finish of the match . Lesnar would defeat Angle , after delivering an F @-@ 5 , to win his second WWE Championship . Lesnar was diagnosed with a legitimate concussion from the botched shooting star press . + After WrestleMania , Lesnar turned his attention to John Cena , who had returned from injury in February 2003 and who had been F @-@ 5'd into a ringpost in the same manner Angle had been . Cena claimed Lesnar nearly ended his career and even named his new finishing move the " F.U. " as a jab at the new champion . The feud ended in a match at Backlash when Lesnar defeated Cena . On the following episode of SmackDown ! , Lesnar returned to his rivalry with Big Show after he injured Rey Mysterio badly during their match at Backlash . Show 's attack resulted in Mysterio being carried out on a stretcher and back board and Big Show took Mysterio off the stretcher and swung the back board into the ringpost , compounding the injury . Lesnar called out the Big Show , who demanded that Lesnar put his title on the line against him . This led to a stretcher match at Judgment Day for the title . Lesnar successfully retained his title with help from Rey Mysterio and a forklift . During the scripted rivalry , on SmackDown , Lesnar lifted Big Show off the top @-@ rope in a superplex which caused the ring to collapse on impact . 
+ As Lesnar and Big Show continued their rivalry , Kurt Angle returned from his neck surgery and he and Lesnar began to form a more friendly rivalry , as the two were allies yet contenders for the title . At the first ever SmackDown brand @-@ exclusive pay @-@ per @-@ view in July , Vengeance , Lesnar took on Angle and Big Show in a No Disqualification triple threat match for his title , which ended after Angle hit the Angle Slam on both Big Show and Lesnar , pinning the champion to become WWE Champion for a fourth time . + Lesnar continued to aggressively pursue the WWE title despite his friendship with Angle . Mr. McMahon found his way into the angle , at first berating Lesnar , who had involved himself in McMahon 's rivalry with Zach Gowen , for losing to Angle . This all turned out to be a swerve that came into focus on the August 7 , 2003 SmackDown ! in Kelowna , British Columbia . That night , Lesnar and McMahon were to face each other in a steel cage match with Angle as the special guest referee as per McMahon 's orders on the previous week 's program . During the match , Lesnar had passed out due to a staged backstage incident and McMahon was set to pin him , but Angle refused to allow McMahon to win that way . As the two men began to argue , Lesnar rose to his feet , revealing the ruse to the crowd , and F @-@ 5'd Angle . He then brutally beat Angle while McMahon watched , and celebrated with him afterwards , turning heel in the process . At SummerSlam , Lesnar lost to Angle when Angle made Lesnar tap out to the ankle lock . After that , Lesnar would cement his heel turn by brutalizing smaller wrestlers and attacking his rivals on a more consistent basis . He returned to using the F @-@ 5 to propel his opponents ' legs into the ringpost , as he did to Spanky and Gowen , and interfered in Angle 's matches on two separate occasions . On the September 18 , 2003 episode of SmackDown ! 
, Lesnar received his third shot at Angle in as many months when he faced the champion in a sixty @-@ minute Iron Man match for the title . Lesnar won the match and his third WWE Championship by a final count of five to four . + Lesnar successfully defended his newly won title against the debuting Paul London on the October 9 edition of SmackDown ! . He returned to feuding with The Undertaker . Lesnar had previously cost Undertaker the title in a match against then @-@ champion Kurt Angle , which granted him a shot at Lesnar 's title . At No Mercy , Lesnar defeated Undertaker in a Biker Chain match . The rivalry then came to an end after The Undertaker chose to focus on Mr. McMahon . + After Paul Heyman returned to WWE as general manager of SmackDown ! , Lesnar aligned himself with his former manager . With Survivor Series coming up , Lesnar decided to challenge Angle to a traditional Survivor Series elimination tag team match . Lesnar chose Big Show as his first teammate , with Heyman adding a returning Nathan Jones and a debuting Matt Morgan to bring the team number to four . Angle chose Chris Benoit and The APA to join his team . However , Faarooq was injured during a match with Lesnar and Angle 's team was forced to find a replacement for him . Lesnar 's team picked A @-@ Train to fill the fifth and final spot for them after he attacked John Cena , who refused to accept an invitation to join Lesnar 's team . Cena instead joined Angle 's team , and Angle added Hardcore Holly as the fifth member ; Lesnar had injured Holly the year before and he hadn 't wrestled since . In the climax of the match , Chris Benoit became only the second wrestler to make Lesnar tap out . Lesnar faced Benoit in a singles bout two weeks later for the WWE Championship on SmackDown ! , where Lesnar won after Benoit passed out to Lesnar 's debuting Brock Lock submission hold . + The Survivor Series event marked the first time Lesnar met Goldberg from the Raw brand . 
After Lesnar claimed in a backstage interview that he could beat anybody in the world , Goldberg interrupted the interview and introduced himself to Lesnar , shaking hands with him before leaving with a staredown . Lesnar followed this rivalry with a feud involving Hardcore Holly . In the storyline , Holly wanted revenge on Lesnar for legitimately injuring his neck during a previous match between the two in 2002 which left Holly in need of neck surgery and out of action for a year . At the Royal Rumble in 2004 , Lesnar defeated Holly to retain the WWE Championship . Later in the Royal Rumble match , Lesnar attacked Goldberg and delivered an F @-@ 5 , enabling Kurt Angle to eliminate him . + + = = = = Final storylines and departure ( 2004 ) = = = = + + In February , Lesnar faced Eddie Guerrero for the WWE title at No Way Out . Late in the match , Goldberg delivered a spear to Lesnar while the ref was unconscious . Afterwards , Guerrero went to pin Lesnar but Lesnar kicked out at two . Lesnar then attempted to F @-@ 5 Guerrero but Guerrero reversed it into a DDT . Guerrero then hit a frog splash ; pinning Lesnar to win the WWE Championship . An angry Lesnar then began feuding with Goldberg , blaming him for losing his title , and a match was set up between the two at WrestleMania XX . During the feud with Goldberg , Lesnar was also at odds with Stone Cold Steve Austin , who was shown suggesting to Goldberg that he attack Lesnar at No Way Out . After Lesnar attacked Austin on Raw and stole his four @-@ wheeler , Austin was inserted as the special guest referee for the WrestleMania match . On the March 4 episode of SmackDown , Lesnar defeated Hardcore Holly in his last match on a weekly WWE televised show . Behind the scenes , it was widely known that the match was Goldberg 's last in WWE . Only a week before WrestleMania , rumors surfaced that Lesnar , too , was leaving to pursue a career in the National Football League ( NFL ) . 
As a result , Lesnar 's match with Goldberg became a fiasco as the fans at Madison Square Garden jeered and heckled both wrestlers vociferously . Goldberg gained victory after delivering a Jackhammer to Lesnar and both men subsequently received Stone Cold Stunners from Austin . + + = = = New Japan Pro Wrestling ( 2005 – 2007 ) = = = + + On October 8 , 2005 , Lesnar won the IWGP Heavyweight Championship on his debut match in a three @-@ way match with Kazuyuki Fujita and Masahiro Chono at a New Japan Pro Wrestling ( NJPW ) show in the Tokyo Dome . Lesnar is one of the few American wrestlers to have held this title . He won the match by pinning Masahiro Chono after an F @-@ 5 , which he had renamed the Verdict since WWE owns the trademark on the F @-@ 5 name . After the match , Lesnar stated that this name was referring to his lawsuit against WWE . + On December 6 , WWE filed a motion for a temporary restraining order to prevent Lesnar from continuing to work with NJPW , but the court did not grant it . Following that , he had two non @-@ title victories against Manabu Nakanishi and Yuji Nagata . Lesnar successfully defended his championship on January 4 , 2006 , against former champion Shinsuke Nakamura . On January 13 , WWE once again filed an injunction against Lesnar to stop him from defending the IWGP Heavyweight Championship which was also not enforced as he went on to retain his championship against former Sumo Wrestling Grand Champion Akebono on March 19 , at the Sumo Hall . Lesnar had another successful title defense against Giant Bernard on May 3 , 2006 . This was the first American vs. American title match in NJPW since Vader vs. Stan Hansen in 1990 . + On July 15 , 2006 , New Japan Pro Wrestling announced Lesnar would not return to defend the IWGP Heavyweight Championship due to " visa issues " and had been stripped of the title . A tournament was held on July 16 to determine the new champion which was won by Hiroshi Tanahashi . 
Lesnar continued to possess the physical IWGP Championship belt until late June 2007 . + Approximately one year later on June 29 , 2007 , Lesnar defended his IWGP Heavyweight Championship against TNA World Heavyweight Champion Kurt Angle in a champion versus champion match . Inoki Genome Federation promoter Antonio Inoki had stated Lesnar was the " proper " IWGP Heavyweight Champion as he was not defeated for the title . Angle would defeat Lesnar by forcing him to tap out to the Angle lock to win the IWGP Heavyweight Championship as recognized by IGF and Total Nonstop Action Wrestling ( TNA ) . This was Lesnar 's last match as a professional wrestler until 2012 , when he re @-@ signed with WWE . + + = = = = Lawsuit = = = = + + Lesnar had previously signed a non @-@ compete clause in order to be released from his contract with WWE , which prohibited him from working for any other professional wrestling companies before June 2010 . However , he decided to challenge this ruling in court . WWE responded with a counterclaim after Lesnar breached the agreement by appearing at a New Japan Pro Wrestling show in 2004 . In July 2005 , the two sides dropped their claims and entered negotiations to renew their relationship . WWE had offered Lesnar a contract , but on August 2 , 2005 , WWE 's official website reported that Lesnar had withdrawn from any involvement with the company . The lawsuit began to enter settlement talks on September 21 , but did not get solved . + On January 14 , 2006 , Judge Christopher Droney stated that unless WWE gave him a good argument between then and the 25th , he would rule in favor of Lesnar , giving him a summary judgment . This would have enabled Lesnar to work anywhere , immediately . WWE was later granted a deadline postponement . On April 24 , WWE announced on WWE.com that both parties had reached a settlement . On June 12 , a federal judge dismissed the case at the request of both legal parties . 
+ + = = = Return to WWE = = = + + + = = = = Feud with Triple H and ending the Streak ( 2012 – 2014 ) = = = = + + Lesnar returned to the WWE on April 2 , 2012 , on Raw , as a heel by confronting and delivering an F @-@ 5 to John Cena . The following week on Raw , general manager John Laurinaitis revealed that he signed Lesnar to bring " legitimacy " back to the WWE and become the " new face of the WWE " . Laurinaitis also announced that Lesnar would face Cena at Extreme Rules with the Extreme Rules stipulation later added to the match . Lesnar was dominant throughout the match until Cena punched Lesnar in the face with a steel chain wrapped around his fist . Cena then delivered the Attitude Adjustment to Lesnar onto steel steps and Lesnar lost the match . + The following night on Raw , WWE 's Chief Operating Officer Triple H refused to give in to Lesnar 's unreasonable contract demands ( which included being given his own personal jet and having Raw renamed to Monday Night Raw Starring Brock Lesnar ) , resulting in Lesnar attacking him and breaking his arm with a Kimura lock in storyline . The next week on Raw , Paul Heyman made his return as Lesnar 's legal representative and claimed that Lesnar was quitting WWE . He later announced a lawsuit against WWE for breach of contract . At No Way Out in June , Triple H challenged Lesnar , who was not present , to a match at SummerSlam which Lesnar refused . Stephanie McMahon would later goad Heyman into accepting the match on Lesnar 's behalf on July 23 at Raw 1000 . On August 19 , at SummerSlam , Lesnar defeated Triple H in a No Disqualification match via submission after once again breaking his arm in storyline . The following night on Raw , Lesnar declared himself the new " King of Kings " and said that he would depart from WWE after his victory over Triple H , stating that he had conquered everything in the company . 
+ Lesnar returned on the January 28 , 2013 episode of Raw , confronting Vince McMahon , who was about to fire Heyman , and despite Heyman 's pleas , Lesnar hit McMahon with an F @-@ 5 , breaking McMahon 's pelvis in storyline . The following week , during The Miz 's MizTV talk show , Raw Managing Supervisor Vickie Guerrero revealed herself as the one who signed Lesnar to a new contract to impress McMahon . On the February 25 episode of Raw , Lesnar once again attempted to attack McMahon , only to get into a brawl with the returning Triple H , which resulted in Lesnar legitimately having his head split open and requiring eighteen stitches . The following week , Triple H issued a challenge to Lesnar , requesting a rematch with him at WrestleMania 29 , which Lesnar accepted but only after Triple H signed a contract and Lesnar named the stipulation . The following week , after Triple H signed the contract and assaulted Heyman , the stipulation was revealed as No Holds Barred with Triple H 's career on the line . Lesnar ended up losing the match after Triple H hit him with a Pedigree onto the steel steps . On the April 15 episode of Raw , Lesnar attacked 3MB ( Heath Slater , Drew McIntyre , and Jinder Mahal ) before Heyman challenged Triple H to face Lesnar in a steel cage match at Extreme Rules , which Triple H accepted the following week . On May 19 at the pay @-@ per @-@ view , after interference from Heyman , Lesnar defeated Triple H and ended their feud . Lesnar returned on the June 17 episode of Raw , attacking Heyman 's fellow client CM Punk with an F @-@ 5 . Despite the accusations from Punk , Heyman claimed that he was not behind Lesnar 's attack on him . However , in July , Heyman turned on Punk , and claimed that Punk could not beat Lesnar , which led to Lesnar making his return and attacking Punk on the July 15 episode of Raw . The following week on Raw , Punk challenged Lesnar to a match at SummerSlam , where Lesnar defeated Punk in a no disqualification match . 
+ On the December 30 episode of Raw , Lesnar returned with Heyman to announce his intentions to challenge the winner of the upcoming WWE World Heavyweight Championship match between Randy Orton and John Cena at the Royal Rumble . Lesnar then dared any wrestler who disapproved of that notion to challenge him , which was answered by Mark Henry , and a brawl would ensue , ending with Lesnar delivering an F @-@ 5 to Henry . The following week on Raw , Henry challenged Lesnar again , only to have Lesnar dislocate his elbow with the Kimura lock in storyline , and this led Big Show to come out afterwards to confront Lesnar , thus starting a feud which was settled at Royal Rumble , where Lesnar defeated the Big Show after attacking him with a steel chair before the match began . On the February 24 , 2014 episode of Raw , after Heyman stated that Lesnar had requested a match for the WWE World Heavyweight Championship at WrestleMania XXX but had instead received an open contract to face anyone else of his choosing , The Undertaker returned and chokeslammed Lesnar through a table , setting up their match at WrestleMania . Lesnar defeated Undertaker after executing three F @-@ 5s , ending Undertaker 's undefeated WrestleMania streak at 21 , a feat that was described by Sports Illustrated as being " the most shocking result since the Montreal Screwjob " . + + = = = = WWE World Heavyweight Champion ( 2014 – 2015 ) = = = = + + On the July 21 , 2014 episode of Raw , Triple H announced that Lesnar would face John Cena at SummerSlam for the WWE World Heavyweight Championship . At SummerSlam , Lesnar defeated Cena to become the WWE World Heavyweight Champion , and during the match he delivered sixteen suplexes ( most of which were German suplexes ) and two F @-@ 5s to Cena , who barely managed any offense . 
On the August 19 episode of Main Event , Triple H announced that Cena was invoking his championship rematch clause against Lesnar at Night of Champions , where Lesnar was disqualified due to Seth Rollins interfering , but retained his championship , which could not be lost via disqualification . Later in the year , after Rollins reunited with The Authority , he was added to Lesnar and Cena 's championship match at Royal Rumble , making it a triple threat match , which Lesnar won despite suffering a storyline broken rib during the match . + Lesnar 's next challenger was Roman Reigns , who won the Royal Rumble match to earn the right to face Lesnar for the title at WrestleMania 31 . During his main @-@ event match against Reigns , Lesnar delivered multiple suplexes and was heard exclaiming , " Suplex City , bitch ! " , and thereafter " Suplex City " became one of his signature catchphrases and merchandise motifs . After Lesnar and Reigns traded a few false finishes , Rollins cashed in his Money in the Bank contract while the match was in progress , making it a triple threat ; Rollins then pinned Reigns to win the title . The following night on Raw , Lesnar tried to invoke his rematch clause and subsequently attacked commentators Booker T , John " Bradshaw " Layfield , and Michael Cole , as well as a cameraman after Rollins refused the rematch , which led to Stephanie McMahon suspending Lesnar indefinitely in storyline . + Lesnar returned on the June 15 episode of Raw as a fan favorite , being chosen by The Authority as the number one contender to Rollins ' WWE World Heavyweight Championship at Battleground . On July 4 , Lesnar made his first non @-@ televised wrestling appearance for WWE since his 2012 return , defeating Kofi Kingston at The Beast in the East live event in Tokyo in a quick winning effort ; he also delivered F @-@ 5s to Kingston 's New Day stablemates Big E and Xavier Woods after the match . 
On July 19 at Battleground , Lesnar dominated Rollins , delivering 13 suplexes ; mid @-@ pinfall , after performing an F @-@ 5 , he was attacked by The Undertaker , who incapacitated Lesnar with a chokeslam and two Tombstone Piledrivers ; this ended the match , with Lesnar winning by disqualification and Rollins retaining the championship . + + = = = = Various feuds and storylines ( 2015 – present ) = = = = + + The following night on Raw , Undertaker explained that he had attacked Lesnar not for ending his WrestleMania streak , but rather for Lesnar allowing Heyman to constantly taunt Undertaker about it ; this led to the two brawling throughout the arena and a WrestleMania rematch being scheduled for SummerSlam on August 23 , where Undertaker would controversially defeat Lesnar ; during the match , Undertaker tapped out to a Kimura lock by Lesnar and the timekeeper rang the bell , but the referee did not see the tapout and demanded that the match continue , which saw Undertaker then hitting Lesnar with a low blow and Lesnar passed out to Undertaker 's submission hold , Hell 's Gate . The following night on Raw , Lesnar and Heyman challenged Undertaker to an immediate rematch , only to be confronted by Bo Dallas , who mocked Lesnar about his defeat ; Lesnar then responded by delivering 3 German suplexes and an F @-@ 5 to Dallas . + During Night of Champions , it was announced that Lesnar would face The Undertaker at the Hell in a Cell pay @-@ per @-@ view , where Lesnar defeated The Undertaker after a low blow and F @-@ 5 onto the exposed ring floor , ending their feud . The match was later voted " Match of the Year " during the 2015 Slammy Awards . On the January 11 episode of Raw , Lesnar returned , attacking The New Day , The League of Nations ( Sheamus , King Barrett , Rusev , and Alberto Del Rio ) , and Kevin Owens , before giving an F @-@ 5 to Roman Reigns . 
The following week on Raw , Lesnar would brawl with Reigns until they were both attacked by The Wyatt Family . At the Royal Rumble , Lesnar was the 23rd entrant , eliminating four competitors before being eliminated by Bray Wyatt with help from the rest of The Wyatt Family . + On the January 25 episode of Raw , Stephanie McMahon announced that the main event of Fastlane would be a triple threat match between Lesnar , Roman Reigns , and Dean Ambrose to determine who would face Triple H for the WWE World Heavyweight Championship at WrestleMania 32 . In the following weeks , Lesnar would be continuously provoked by Ambrose , with Reigns saving him from the subsequent attacks by Lesnar . At Fastlane , Lesnar dominated most of the match before he was put through two announce tables by Ambrose and Reigns ; he would ultimately lose the match after Reigns pinned Ambrose . Because of this , Lesnar attacked Ambrose in the parking lot as he was arriving at the arena for Raw , but Ambrose would return later in the night , having hijacked an ambulance , and he challenged Lesnar to a No Holds Barred Street Fight match at WrestleMania 32 , where Lesnar defeated Ambrose . On the July 7 edition of SmackDown , it was announced that Lesnar would be facing the returning Randy Orton at SummerSlam . On July 19 , at the 2016 WWE Draft , Lesnar was drafted to Raw . + + = = Football career = = + + After his match at WrestleMania XX , Lesnar sidelined his career in WWE to pursue a career in the National Football League ( NFL ) despite not playing American football since high school . The WWE issued this statement on their official website , WWE.com , following his departure : + Brock Lesnar has made a personal decision to put his WWE career on hold to prepare to tryout for the National Football League this season . Brock has wrestled his entire professional career in the WWE and we are proud of his accomplishments and wish him the best in his new endeavor . 
+ Lesnar later told a Minnesota radio show that he had " three wonderful years " in WWE , but had grown unhappy and always wanted to play pro football , adding that he did not want to be 40 years old and wondering if he could have " made it " in football . In an interview about the NFL , he stated : + This is no load of bull ; it 's no WWE stunt . I am dead serious about this . I ain 't afraid of anything and I ain 't afraid of anybody . I 've been an underdog in athletics since I was five . I got zero college offers for wrestling . Now people say I can 't play football , that it 's a joke . I say I can . I 'm as good an athlete as a lot of guys in the NFL , if not better . I 've always had to fight for everything . I wasn 't the best technician in amateur wrestling but I was strong , had great conditioning , and a hard head . Nobody could break me . As long as I have that , I don 't give a damn what anybody else thinks . + Lesnar had a great showing at the NFL Combine . On April 17 , 2004 , a minivan collided with his motorbike ; he suffered a broken jaw and left hand , a bruised pelvis , and a pulled groin . Several NFL teams expressed interest in watching Lesnar work out . The Minnesota Vikings worked out Lesnar on June 11 , 2004 but he was hampered by the groin injury suffered in the April motorcycle accident . On July 24 it was reported that he was nearly recovered from his groin injury . He signed with the Vikings on July 27 and played in several preseason games for the team . He was released by the Vikings on August 30 , 2004 . Lesnar received an invitation to play as a representative for the Vikings in NFL Europa but declined due to his desire to stay in the United States with his family . He had several football cards produced of him during his time with the Vikings . 
+ + = = Mixed martial arts career = = + + + = = = Hero 's ( 2007 ) = = = + + On April 29 , 2006 , after the final match of the K @-@ 1 World Grand Prix 2006 in Las Vegas , Lesnar announced his intent to join K @-@ 1 's mixed martial arts league , Hero 's . He trained with Minnesota Martial Arts Academy under Greg Nelson and Minnesota Assistant Head wrestling coach Marty Morgan . Lesnar announced on August 12 in Las Vegas that he had signed a deal with the K @-@ 1 promotion . His first fight was scheduled against Choi Hong @-@ man of Korea on June 2 , 2007 , at the K @-@ 1 Dynamite ! ! USA show . However , prior to the match , Hong @-@ Man was replaced by Min Soo Kim . Lesnar submitted Soo Kim with strikes in 1 : 09 of the first round to win his first official MMA match . + + = = = Ultimate Fighting Championship ( 2008 – 2011 ) = = = + + During UFC 77 , it was announced that Lesnar had reached a deal to fight with the Ultimate Fighting Championship ( UFC ) . On February 2 , 2008 , Lesnar made his debut with the promotion in an event titled UFC 81 : Breaking Point against former UFC Heavyweight Champion , Frank Mir . Due to his large hands , Lesnar was wearing 4XL gloves for the fight , making him the second man in Nevada 's combat sports history to wear such gloves after Choi Hong @-@ man . Lesnar secured an early takedown and began landing numerous punches but was docked a point after a punch hit Mir on the back of the head . Following another takedown by Lesnar , Mir managed to secure a kneebar and force a submission at 1 : 30 of the first round and Lesnar lost in his UFC debut . At UFC 82 , former UFC Heavyweight Champion and Hall of Famer Mark Coleman was announced to fight Lesnar at UFC 87 . However , Coleman withdrew from the fight due to an injury and was replaced by Heath Herring . Lesnar defeated Herring by unanimous decision . + Lesnar would then face Randy Couture for the UFC Heavyweight Championship at UFC 91 on November 15 . 
Lesnar would beat Couture via a technical knockout ( TKO ) in the second round to become the new UFC Heavyweight Champion . + On December 27 , 2008 , at UFC 92 , Frank Mir defeated Antônio Nogueira for the Interim Heavyweight Championship and was to face Lesnar for the Undisputed UFC Heavyweight Championship at UFC 98 . Immediately after winning the Interim Heavyweight title , Mir found Lesnar in the crowd and shouted , " You 've got my belt " . Due to a knee injury to Mir , the title unification match with Lesnar that was originally slated to be the UFC 98 main event was postponed . Lesnar won the postponed rematch with Mir at UFC 100 on July 11 , 2009 , via technical knockout in the second round . The win earned Lesnar Beatdown of the Year honors , with Anderson Silva , from Sherdog for 2009 . During his post @-@ match celebration , Lesnar flipped off the crowd who had been booing him . Lesnar also made a disparaging comment about the pay @-@ per @-@ view 's primary sponsor Bud Light , claiming they " won 't pay me nothin ' " and promoted Coors Light instead . Lesnar later apologized for his remarks at the post @-@ fight press conference , where he held a bottle of Bud Light and endorsed their product . + On July 1 , 2009 , it was reported that the winner of the Shane Carwin vs. Cain Velasquez fight at UFC 104 would face Lesnar but the match was scrapped and Lesnar was scheduled to defend his belt against Shane Carwin at UFC 106 on November 21 . On October 26 , 2009 , it was announced that Lesnar pulled out of his Carwin bout due to an illness . UFC President Dana White said Lesnar had been ill for three weeks , claiming he had never been this sick in his life and that it would take him a while to recover ; his fight with Carwin was rescheduled for UFC 108 on January 2 , 2010 . 
Lesnar initially sought treatment in Canada , but later told reporters that he had received " Third World treatment " at a hospital in Brandon , Manitoba , and that seeking better treatment in the U.S. saved his life . Lesnar went on to criticize Canadian health care further and stated that he shared his experience in an effort to speak " on the behalf of the doctors in the United States that don 't want health care reform to happen " . + On November 4 , it was confirmed that Lesnar was suffering from mononucleosis and that his bout with Carwin would have to wait a bit longer and the fight for Lesnar 's heavyweight championship was cancelled . On November 14 , at the UFC 105 post @-@ fight conference , Dana White stated , " [ Lesnar ] ' s not well and he 's not going to be getting well anytime soon " and that an interim title match might need to be set up . In addition to mononucleosis , it was revealed that he was suffering from a serious case of diverticulitis , an intestinal disorder , which required surgery . After further diagnosis , Lesnar underwent surgery on November 16 to close a perforation in his intestine that had been leaking fecal matter into his abdomen , causing pain , abscesses , and overtaxing his immune system to the point that he contracted mononucleosis . From the level of damage to Lesnar 's system , the surgeon estimated that the intestinal condition had been ongoing for around a year . + In January 2010 , Lesnar announced on ESPN 's SportsCenter that he was scheduled to make a return to the UFC in the summer . A match between Frank Mir and Shane Carwin took place on March 27 at UFC 111 to determine the Interim Heavyweight Champion , and Lesnar 's next opponent . Shane Carwin defeated Mir via knockout in the first round , becoming the new Interim Champion . After the fight , Lesnar came into the ring and stated , " It was a good fight but he 's wearing a belt that 's a make @-@ believe belt . I 've got the real championship belt " . 
Lesnar faced Carwin at UFC 116 to unify the heavyweight titles . Early in the first round , Carwin knocked Lesnar down with a punch , gave him a cut above his left eye , and used a ground and pound attack for the rest of the round . In the next round , Lesnar was able to take Carwin down , attain a full mount , then move into side @-@ control and finish the fight with an arm triangle choke . With the victory , Lesnar became the Undisputed UFC Heavyweight Champion , earning his first Submission of the Night and giving Carwin his first loss . The win also tied a UFC record for most consecutive successful Heavyweight Championship defenses . + Lesnar 's next defense was against undefeated top contender Cain Velasquez on October 23 , at the Honda Center in Anaheim , California at UFC 121 . Dana White announced via SportsNation that the UFC would bring back UFC Primetime to hype the fight . Lesnar was defeated by Velasquez for the title by TKO in the first round . + On January 11 , 2011 , Lesnar was announced as a coach of The Ultimate Fighter Season 13 , opposite Junior dos Santos , with the two expected to fight on June 11 at UFC 131 ; however , Lesnar was struck with another bout of diverticulitis and had to withdraw from the fight on May 12 . He was replaced by Shane Carwin , who ended up losing against dos Santos . Lesnar underwent surgery on May 27 to help battle his problems with diverticulitis . Dana White said that he had a 12 @-@ inch piece of his colon removed . + In its May 2011 issue , ESPN 's magazine published a story listing the highest paid athlete based on base salary and earnings for the most recent calendar year or most recent season in 30 sports . Lesnar topped the list for MMA at $ 5 @.@ 3 million , which included his reported bout salaries and estimated pay @-@ per @-@ view bonuses . + In the summer of 2011 , Lesnar announced that he was returning to action , stating , " I feel like a new man . Healthy . Strong . I feel like I used to feel " . 
His return match was scheduled to be at UFC 141 on December 30 in Las Vegas against former Strikeforce heavyweight champion Alistair Overeem . Overeem won the fight by way of technical knockout in the first round . The result of the fight remains controversial , as Overeem tested positive for elevated levels of testosterone prior to his next fight . Lesnar then announced his retirement from MMA , mentioning his struggles with diverticulitis and saying " tonight was the last time you 'll see me in the octagon " . + Speculation about a return to MMA lasted until March 24 , 2015 , when Lesnar announced in an interview on SportsCenter that he had re @-@ signed with WWE and officially closed the door on a return to MMA , even though he was offered a deal " ten times more " than what he had made previously in his MMA career . He further elaborated that , while he was training for months for a return to the UFC , he felt " physically great but something was lacking mentally " . Lesnar added that " [ he 's ] an older caveman now , so [ he ] makes smarter caveman decisions " and that he chose to sign with WWE instead of returning to MMA because he could " work part @-@ time with full @-@ time pay " . + + = = = UFC 200 ( 2016 ) = = = + + Though Lesnar said he was " closing the door on MMA " in March 2015 , UFC announced on June 4 , 2016 , that he would return at UFC 200 on July 9 . WWE confirmed it had granted Lesnar " a one @-@ off opportunity " to compete at UFC 200 before he would return to the company for SummerSlam on August 21 . + Lesnar , representing Canada for the first time in his career , defeated Mark Hunt by unanimous decision ( 29 @-@ 27 ) , avoiding 19 of Hunt 's 30 attempted standing strikes , and taking him down four times to land 43 significant ground strikes , 32 in the final round . He also won a UFC @-@ record $ 2 @.@ 5 million purse . 
+ On July 15 , Lesnar was notified of a potential anti @-@ doping policy violation by the United States Anti @-@ Doping Agency ( USADA ) stemming from an undisclosed banned substance in an out @-@ of @-@ competition sample collected on June 28 . Shortly after this was announced , WWE said Lesnar 's match with Randy Orton at SummerSlam would still take place . Hunt told UFC to either pay him half of Lesnar 's purse or let him out of his contract . He later changed his mind on Twitter , and asked for the whole purse . UFC has not yet publicly responded to Hunt . Lesnar told the Associated Press , " We will get to the bottom of this . " On July 19 , the UFC announced that a second sample taken in @-@ competition on July 9 tested positive for the same banned substance discovered in the previous out @-@ of @-@ competition sample . + + = = = UFC pay @-@ per @-@ views = = = + + + = = Personal life = = + + Lesnar married Rena Greek , better known as Sable , on May 6 , 2006 . They reside on a farm in Maryfield , Saskatchewan , having previously lived in Maple Plain , Minnesota . They have two sons : Turk ( born June 3 , 2009 ) and Duke ( born July 21 , 2010 ) . Lesnar also has twins with his former fiancée , Nicole McClain , the first being a son named Luke ( born Brock Jr . ) and the second a daughter named Mya Lynn ( born April 10 , 2002 ) . Mya is 10 minutes older than Luke . Lesnar has full custody of the twins , and is the stepfather of Mariah , Greek 's daughter with her late husband . + Lesnar is a conservative and a supporter of the Republican Party . He is a member of the National Rifle Association , and made an appearance at their annual meeting in May 2011 to discuss his passion for hunting and his role as a spokesman for Fusion Ammunition . 
+ During his first run in WWE , Lesnar developed addictions to both alcohol and painkillers , allegedly drinking a bottle of vodka per day and taking hundreds of Vicodin pills per month to manage the pain caused by wear and tear on his body ; he named his accident at WrestleMania XIX as a particular source of pain . Lesnar claims that , as a result of his addiction and mental exhaustion , he does not remember " an entire two years " of his WWE career . + In January 2001 , Lesnar was arrested by police in Louisville , Kentucky , on suspicion of possessing large amounts of anabolic steroids . The charges were dropped when it was discovered that the substances were a legal growth hormone . His lawyer described it as a " vitamin type of thing " . + On December 15 , 2011 , Lesnar was charged with hunting infractions on a trip to Alberta on November 19 , 2010 . Two charges were dropped , but Lesnar pleaded guilty to the charge of improper tagging of an animal . He was fined $ 1 @,@ 725 and given a six @-@ month hunting suspension . + As of July 2016 , Lesnar 's eldest son , Brock Jr. , is ranked # 1 in Saskatchewan and # 4 in all of Canada in amateur wrestling . + + = = Other media = = + + Lesnar appears in the video games WWE SmackDown ! Shut Your Mouth , WWE SmackDown ! Here Comes the Pain , Madden NFL 06 , UFC 2009 Undisputed , UFC Undisputed 2010 , WWE ' 12 , WWE ' 13 , WWE 2K14 , WWE 2K15 , WWE 2K16 , and WWE 2K17 . + In 2003 , WWE Home Video released a DVD chronicling Lesnar 's career entitled Brock Lesnar : Here Comes the Pain . It was re @-@ released in 2012 as a three @-@ disc DVD and two @-@ disc Blu @-@ ray collector 's edition to tie in with Lesnar 's WWE return . It was also expanded to include new matches and interviews . + Lesnar was featured on the covers of Flex and Muscle & Fitness magazines in 2004 , and Minneapolis ' City Pages in 2008 . + In 2009 , Lesnar signed an endorsement deal with Dymatize Nutrition . 
A CD containing footage of Lesnar training was included with Dymatize 's " Xpand " product . + In 2011 , Lesnar published an autobiography titled Death Clutch : My Story of Determination , Domination , and Survival ( ISBN 978 @-@ 0062023117 ) . It was co @-@ written with Paul Heyman . + In a 2013 post on his blog , Attack on Titan author Hajime Isayama revealed that he drew inspiration from Lesnar for the character of the Armored Titan . + Lesnar has also appeared in multiple comedic Instagram and Vine videos by actor Eric Stonestreet . + + = = Filmography = = + + + = = = Television = = = + + + = = = Video games = = = + + + = = In wrestling = = + + Finishing moves + Brock Lock ( Over @-@ the @-@ shoulder single leg Boston crab ) – 2002 – 2004 + F @-@ 5 ( WWE ) / Verdict ( NJPW / IGF ) ( Fireman 's carry facebuster ) – 2002 – 2006 ; 2012 – present + Kimura lock – 2012 – present + Shooting star press – OVW ; only used once in WWE + Signature moves + Backbreaker + Fallaway slam + Gorilla press slam + Knee lifts to the opponent 's midsection + Multiple suplex variations + Belly @-@ to @-@ back , sometimes to two opponents at once + Fisherman 's , sometimes while delaying + Overhead belly @-@ to @-@ belly , sometimes into or out of the ring + Release / Rolling German + Snap + Vertical + Multiple turnbuckle thrusts + Powerslam + Rear naked choke + Running powerbomb + Standing double leg takedown followed by mounted punches + Triple non @-@ release powerbomb + Managers + Mr. 
McMahon + Sable + Paul Heyman + Nicknames + " The Anomaly " + " The Beast ( Incarnate ) " + " The Conqueror " + " The Freak " + " The Next Big Thing " + " The One in 21 – 1 / 22 @-@ 1 " + " The Nightmare of Suplex City " + Entrance themes + Ultimate Fighting Championship + " Enter Sandman " by Metallica + " Nickel Size Hail ( And the Damaging Winds ) " by Sunny Ledfurd + World Wrestling Entertainment / WWE + " Enforcer " by Jim Johnston ( April 8 , 2002 – June 3 , 2002 ) + " Next Big Thing " by Jim Johnston ( June 10 , 2002 – March 14 , 2004 ; April 2 , 2012 – August 20 , 2012 ) + " Next Big Thing ( Remix ) " by Jim Johnston ( January 28 , 2013 – present ) + + = = Mixed martial arts record = = + + + = = Championships , awards , and honors = = + + + = = = Collegiate wrestling = = = + + National Collegiate Athletic Association + NCAA Division I All @-@ American ( 1999 , 2000 ) + NCAA Division I Heavyweight Champion ( 2000 ) + Big Ten Conference Champion ( 1999 , 2000 ) + National Junior College Athletic Association + NJCAA All @-@ American ( 1997 , 1998 ) + NJCAA Heavyweight Champion ( 1998 ) + North Dakota State University Bison Tournament Champion ( 1997 – 1999 ) + + = = = Mixed martial arts = = = + + Inside Fights + Biggest Draw ( 2008 ) + Rookie of the Year ( 2008 ) + Sherdog Awards + Beatdown of the Year ( 2009 ) + Sports Illustrated + Top Newcomer of the Year ( 2008 ) + Ultimate Fighting Championship + UFC Heavyweight Championship ( 1 time ) + Submission of the Night ( 1 time ) + World MMA Awards + Breakthrough Fighter of the Year ( 2009 ) + Wrestling Observer Newsletter + Best Box Office Draw ( 2008 – 2010 ) + MMA Most Valuable Fighter ( 2008 – 2010 ) + + = = = Professional wrestling = = = + + Guinness World Records + World record : Youngest person to win the WWE Championship ( aged 25 years , 44 days ) + Inoki Genome Federation + IWGP Heavyweight Championship ( 1 time ) 1 + New Japan Pro Wrestling + IWGP Heavyweight Championship ( 1 time ) 1 + Ohio Valley 
Wrestling + OVW Southern Tag Team Championship ( 3 times ) – with Shelton Benjamin + Pro Wrestling Illustrated + Feud of the Year ( 2003 ) vs. Kurt Angle + Feud of the Year ( 2015 ) vs. The Undertaker + Match of the Year ( 2003 ) vs. Kurt Angle in an Iron Man match on SmackDown ! on September 16 + Most Improved Wrestler of the Year ( 2002 ) + Wrestler of the Year ( 2002 , 2014 ) + Ranked # 1 of the top 500 singles wrestlers in the PWI 500 in 2003 + Wrestling Observer Newsletter + Best Brawler ( 2003 ) + Best Wrestling Maneuver ( 2002 ) F @-@ 5 + Feud of the Year ( 2003 ) vs. Kurt Angle + Most Improved Wrestler ( 2002 , 2003 ) + Wrestling Observer Newsletter Hall of Fame ( Class of 2015 ) + World Wrestling Entertainment / WWE + WWE Championship ( 4 times ) 2 + King of the Ring ( 2002 ) + Royal Rumble ( 2003 ) + Slammy Awards ( 5 times ) + Hashtag of the Year ( 2015 ) – # SuplexCity + Match of the Year ( 2015 ) – vs The Undertaker at Hell in a Cell + Rivalry of the Year ( 2015 ) – vs The Undertaker + " Tell Me You Didn 't Just Say That " Moment of the Year ( 2015 ) – Coining " Suplex City " at WrestleMania 31 + The OMG Shocking Moment of the Year ( 2014 ) – Ending The Undertaker 's WrestleMania streak at WrestleMania XXX + 1 ^ Lesnar 's IWGP Heavyweight Championship reign at IGF is considered a continuation of his reign from NJPW . + 2 ^ When Lesnar won the title for the first time it was known as the WWE Undisputed Championship . His next two reigns were as simply WWE Champion , while his final one was as WWE World Heavyweight Champion . + + + = Constant k filter = + + Constant k filters , also k @-@ type filters , are a type of electronic filter designed using the image method . They are the original and simplest filters produced by this methodology and consist of a ladder network of identical sections of passive components . 
Historically , they are the first filters that could approach the ideal filter frequency response to within any prescribed limit with the addition of a sufficient number of sections . However , they are rarely considered for a modern design , the principles behind them having been superseded by other methodologies which are more accurate in their prediction of filter response . + + = = History = = + + Constant k filters were invented by George Campbell . He published his work in 1922 , but had clearly invented the filters some time before , as his colleague at AT & T Co , Otto Zobel , was already making improvements to the design at this time . Campbell 's filters were far superior to the simpler single element circuits that had been used previously . Campbell called his filters electric wave filters , but this term later came to mean any filter that passes waves of some frequencies but not others . Many new forms of wave filter were subsequently invented ; an early ( and important ) variation was the m @-@ derived filter by Zobel who coined the term constant k for the Campbell filter in order to distinguish them . + The great advantage Campbell 's filters had over the RL circuit and other simple filters of the time was that they could be designed for any desired degree of stop band rejection or steepness of transition between pass band and stop band . It was only necessary to add more filter sections until the desired response was obtained . + The filters were designed by Campbell for the purpose of separating multiplexed telephone channels on transmission lines , but their subsequent use has been much more widespread than that . The design techniques used by Campbell have largely been superseded . However , the ladder topology used by Campbell with the constant k is still in use today with implementations of modern filter designs such as the Tchebyscheff filter . Campbell gave constant k designs for low @-@ pass , high @-@ pass and band @-@ pass filters . 
Band @-@ stop and multiple band filters are also possible . + + = = Terminology = = + + Some of the impedance terms and section terms used in this article are pictured in the diagram below . Image theory defines quantities in terms of an infinite cascade of two @-@ port sections , and in the case of the filters being discussed , an infinite ladder network of L @-@ sections . Here " L " should not be confused with the inductance L – in electronic filter topology , " L " refers to the specific filter shape which resembles an inverted letter " L " . + The sections of the hypothetical infinite filter are made of series elements having impedance 2Z and shunt elements with admittance 2Y . The factor of two is introduced for mathematical convenience , since it is usual to work in terms of half @-@ sections where it disappears . The image impedance of the input and output port of a section will generally not be the same . However , a mid @-@ series section ( that is , a section from halfway through a series element to halfway through the next series element ) will have the same image impedance on both ports due to symmetry . This image impedance is designated ZiT due to the " T " topology of a mid @-@ series section . Likewise , the image impedance of a mid @-@ shunt section is designated ZiΠ due to the " Π " topology . Half of such a " T " or " Π " section is called a half @-@ section , which is also an L @-@ section but with half the element values of the full L @-@ section . The image impedance of the half @-@ section is dissimilar on the input and output ports : on the side presenting the series element it is equal to the mid @-@ series ZiT , but on the side presenting the shunt element it is equal to the mid @-@ shunt ZiΠ . There are thus two variant ways of using a half @-@ section . 
+ Parts of this article or section rely on the reader 's knowledge of the complex impedance representation of capacitors and inductors and on knowledge of the frequency domain representation of signals . + + = = Derivation = = + + The building block of constant k filters is the half @-@ section " L " network , composed of a series impedance Z , and a shunt admittance Y. The " k " in " constant k " is the value given by , + $k^2=\frac{Z}{Y}$ + Thus , k will have units of impedance , that is , ohms . It is readily apparent that in order for k to be constant , Y must be the dual impedance of Z. A physical interpretation of k can be given by observing that k is the limiting value of Zi as the size of the section ( in terms of values of its components , such as inductances , capacitances , etc . ) approaches zero , while keeping k at its initial value . Thus , k is the characteristic impedance , Z0 , of the transmission line that would be formed by these infinitesimally small sections . It is also the image impedance of the section at resonance , in the case of band @-@ pass filters , or at ω = 0 in the case of low @-@ pass filters . For example , the pictured low @-@ pass half @-@ section has + $k=\sqrt{\frac{L}{C}}$ . + Elements L and C can be made arbitrarily small while retaining the same value of k . Z and Y however , are both approaching zero , and from the formulae ( below ) for image impedances , + $\lim_{Z,Y\to 0}Z_i=k$ . + + = = = Image impedance = = = + + See also Image impedance # Derivation + The image impedances of the section are given by + $Z_{iT}^2=Z^2+k^2$ + and + $\frac{1}{Z_{i\Pi}^2}=Y^2+\frac{1}{k^2}$ + Given that the filter does not contain any resistive elements , the image impedance in the pass band of the filter is purely real and in the stop band it is purely imaginary . 
For example , for the pictured low @-@ pass half @-@ section , + $Z_{iT}=\sqrt{\frac{L}{C}-\omega^2L^2}$ + The transition occurs at a cut @-@ off frequency given by + $\omega_c=\frac{1}{\sqrt{LC}}$ + Below this frequency , the image impedance is real , + $Z_{iT}=k\sqrt{1-\left(\frac{\omega}{\omega_c}\right)^2}$ + Above the cut @-@ off frequency the image impedance is imaginary , + $Z_{iT}=ik\sqrt{\left(\frac{\omega}{\omega_c}\right)^2-1}$ + + = = = Transmission parameters = = = + + The transmission parameters for a general constant k half @-@ section are given by + $\gamma=\sinh^{-1}\frac{Z}{k}$ + and for a chain of n half @-@ sections + $\gamma_n=n\gamma$ + For the low @-@ pass L @-@ shape section , below the cut @-@ off frequency , the transmission parameters are given by + $\gamma=\alpha+i\beta=0+i\sin^{-1}\frac{\omega}{\omega_c}$ + That is , the transmission is lossless in the pass @-@ band with only the phase of the signal changing . Above the cut @-@ off frequency , the transmission parameters are : + $\gamma=\alpha+i\beta=\cosh^{-1}\frac{\omega}{\omega_c}+i\frac{\pi}{2}$ + + = = = Prototype transformations = = = + + The presented plots of image impedance , attenuation and phase change correspond to a low @-@ pass prototype filter section . The prototype has a cut @-@ off frequency of ωc = 1 rad / s and a nominal impedance k = 1 Ω . This is produced by a filter half @-@ section with inductance L = 1 henry and capacitance C = 1 farad . This prototype can be impedance scaled and frequency scaled to the desired values . The low @-@ pass prototype can also be transformed into high @-@ pass , band @-@ pass or band @-@ stop types by application of suitable frequency transformations . + + = = Cascading sections = = + + Several L @-@ shape half @-@ sections may be cascaded to form a composite filter . Like impedance must always face like in these combinations . There are therefore two circuits that can be formed with two identical L @-@ shaped half @-@ sections . Where a port of image impedance ZiT faces another ZiT , the section is called a Π section . Where ZiΠ faces ZiΠ the section so formed is a T section . Further additions of half @-@ sections to either of these sections form a ladder network which may start and end with series or shunt elements . 
+ It should be borne in mind that the characteristics of the filter predicted by the image method are only accurate if the section is terminated with its image impedance . This is usually not true of the sections at either end , which are usually terminated with a fixed resistance . The further the section is from the end of the filter , the more accurate the prediction will become , since the effects of the terminating impedances are masked by the intervening sections . + + + = The Snowmen = + + " The Snowmen " is an episode of the British science fiction television series Doctor Who , first broadcast on Christmas Day 2012 on BBC One . It is the eighth Christmas special since the show 's 2005 revival and the first to be within a series . It was written by head writer and executive producer Steven Moffat and directed by Saul Metzstein . + The episode is set in the Victorian era and sees the Doctor ( Matt Smith ) brooding with the assistance of Silurian Madame Vastra ( Neve McIntosh ) , her wife Jenny Flint ( Catrin Stewart ) and Sontaran Strax ( Dan Starkey ) , after the loss of companions Amy Pond and Rory Williams in the previous episode , " The Angels Take Manhattan " . He is forced out of hiding to investigate mysterious , sentient snowmen that are building themselves and meets Clara Oswald ( Jenna @-@ Louise Coleman ) , a governess also investigating the snowmen . They discover that the snowmen are being animated by the Great Intelligence ( voice of Ian McKellen ) with the help of a man named Dr Simeon ( Richard E. Grant ) . + Building upon the character 's surprise introduction in " Asylum of the Daleks " , " The Snowmen " introduces Clara as the Doctor 's new companion , though ultimately it would be a third version of her character that would travel with the Doctor starting with " The Bells of Saint John " . 
In addition to Clara , " The Snowmen " also introduces a redesigned TARDIS , revised title sequence and theme music , and sees changes to the Doctor 's costume . The special was produced in August 2012 , with location filming in Newport , Wales and Bristol . It received final ratings of 9 @.@ 87 million viewers in the UK , becoming the fourth most @-@ watched programme of Christmas Day . " The Snowmen " was met with mostly positive reviews from critics , most of whom received the introduction and character of Clara well . However , some felt that Grant and McKellen were underused as villains or the plot was slight because of the focus on characterisation . + + = = Plot = = + + + = = = Prequels = = = + + To promote the special , three prequels were released . The first was broadcast during the 2012 Children in Need telethon on 16 November 2012 , titled " The Great Detective " . The Silurian Madame Vastra , her human wife Jenny Flint , and the Sontaran Strax ( all returning from " A Good Man Goes to War " ) describe a number of strange phenomena to a shadowed fourth detective . The fourth detective reveals himself to be the Doctor , and tells the group that he has retired . + A second prequel , titled " Vastra Investigates " , was released online on 17 December 2012 . At the end of a case , Vastra and Jenny converse with an officer from Scotland Yard and apologise for Strax 's violent wishes for the culprit 's punishment . Vastra explains Strax 's alien origin as well as her own to the officer , much to his astonishment . Vastra reveals that she was awoken by an extension to the London Underground and initially disliked humans , though that changed when she fell in love with Jenny . On the carriage ride home , Jenny notices it is beginning to snow and Vastra notes that the snow should be impossible because there are no clouds in the sky . 
+ A third prequel , titled " The Battle of Demon 's Run — Two Days Later " was released on the United States iTunes and Amazon Video stores on 25 March 2013 . Two days after the events of " A Good Man Goes to War " , Vastra and Jenny convince Strax that he is not mortally wounded and invite him to accompany them back to 1800s London . The scene had been filmed as an extra due to the anticipation that fans would ask how Strax was resurrected and came to be in Vastra 's employ . + + = = = Synopsis = = = + + In 1842 England , a young boy builds a snowman , but refuses to play with the other children . The snowman starts speaking to the boy , repeating his assertions that the other children are silly . Fifty years later , the boy has grown up to be Dr. Walter Simeon , proprietor of the Great Intelligence Institute . He hires men to collect samples of snow , which he places in a large snow @-@ filled globe in his laboratory before feeding the men to a group of animated snowmen . The Doctor , still despondent after losing his former companions Amy Pond and Rory Williams , has parked his TARDIS above Victorian London among the clouds . He uses his allies Vastra , Jenny , and Strax to keep people away from him . They also fill their time investigating mysteries throughout the city . + Elsewhere , barmaid Clara Oswin Oswald investigates a disturbance outside the tavern she works at and finds the Doctor walking by . He attempts to leave discreetly , but Clara follows him to a coach . Not wishing to become involved in matters , the Doctor instructs Strax to bring him a memory worm that will erase the last hour of Clara 's memories with just a touch . Before they can do so , they are surrounded by snowmen created from snow with psychic properties who attack the group . The Doctor realises that Clara 's thoughts are creating the snowmen and ends the threat by instructing her to think of them melting . 
Clara cautions the Doctor that if he wipes her memory , she will forget how to deal with the snowmen . The Doctor reluctantly allows her to go and ascends a staircase to the sky to return to the TARDIS . Clara follows him and knocks on the door , but she hides and flees down the staircase when the Doctor answers . Clara returns to her other job as governess for the children of Captain Latimer . She learns that Latimer 's daughter has been having horrible dreams about their previous governess returning from the dead . Clara realises that the pond that contains the old governess ' body is the only thing still frozen around them . She attempts to track down the Doctor but instead attracts the attention of Jenny , who takes her to see Vastra . Vastra tells Clara she gets only one word to impress the Doctor with if she wants his help . Clara chooses the word " Pond " , which shocks the Doctor and arouses his interest . + Acting on a tip from Strax , the Doctor visits the Great Intelligence Institute posing as Sherlock Holmes . He confronts Dr. Simeon and finds a large glass globe in Simeon 's office that contains psychic snow . The Doctor speaks to the Great Intelligence , the entity that has been speaking to Dr. Simeon since he was a boy . He learns that the Great Intelligence has been controlling the snowmen and has taken an interest in Latimer 's pond . The Doctor visits the pond and deduces that the Great Intelligence is using the old governess ' body as a DNA blueprint to form an ice creature that will retain its form and not melt . While Clara is putting the children to bed , the frozen body of the governess breaks into the house . The Doctor fights her off and is joined by Vastra , Jenny and Strax . Dr. Simeon arrives with more snowmen and tells them he wants the governess ' ice body . The Doctor flees with Clara to the roof of the mansion and then to the TARDIS hovering overhead . 
They are pursued by the ice governess , whom the Doctor traps under a layer of frozen ice crystals . Inside the TARDIS the Doctor gives Clara a TARDIS key , but the ice governess arrives and pulls Clara down off the cloud . + The Doctor picks up Clara and takes her back to Latimer 's mansion , placing her under medical care of Strax . He collects the ice fragments from the governess and places them in a souvenir London Underground biscuit tin . He and Vastra travel to Simeon 's lab , where the Doctor notes the Intelligence 's plan to replace humanity with ice creatures and holds up the tin with the necessary DNA . Dr Simeon grabs the tin and opens it only to find the memory worm , which latches on to him . The Doctor states that the Great Intelligence , which has been existing as a mirror of Dr Simeon 's thoughts , will vanish with the erasure of Simeon 's memories . Instead , the Intelligence reveals that it existed long enough that it can now control Simeon 's body , which it uses to attack Vastra and the Doctor . The influence of the Great Intelligence quickly wanes , and Simeon falls dead . Outside , a salt @-@ water rain has started , and the Doctor sees that another psychic ability has taken control of the snow from the Great Intelligence : the Latimer family , crying for Clara . Strax informs the Doctor upon his return to the Latimer mansion that Clara only has moments left , and she passes away as the Doctor returns the TARDIS key to her . At her funeral , the Doctor reads Clara 's full name on her tombstone and realises she is the woman he met in " Asylum of the Daleks " who became a Dalek , whom he refers to as " Soufflé Girl " . He gleefully announces that a person dying twice is an impossibility and , bidding farewell to his allies for now , the Doctor departs in the TARDIS to investigate and find Clara . The episode concludes in contemporary times , where a young woman resembling Clara walks through the same graveyard , pausing by Clara 's tombstone . 
+ + = = = Continuity = = = + + The Second Doctor previously encountered the Great Intelligence in the serials The Abominable Snowmen ( 1967 ) , set in the 1930s , and The Web of Fear ( 1968 ) , set in the 1960s . In these stories , the Great Intelligence uses robot Yeti as its physical presence . The events of The Web of Fear are alluded to by the Doctor in " The Snowmen " when he presents the London Underground biscuit tin to the Great Intelligence in Dr Simeon 's laboratory ; the Intelligence states , " I do not understand these markings " , in reference to the 1967 London Underground map design on the tin . The Doctor remarks that the Underground is a " key strategic weakness in metropolitan living " , referring to ( and possibly setting in motion ) the future Yeti attack on London via the Underground . + Coleman previously played Oswin Oswald in " Asylum of the Daleks " , though the connection between the two characters is not clarified until Clara reveals she has an interest in soufflés , a trait that Oswin 's character also had . The Doctor , after meeting Clara , wistfully replies " those were the days " when she asks why he isn 't staying to get acquainted with her , which are the same words he tells Craig Owens ( " Closing Time " ) when Craig comments that the Doctor always wins . The final scenes at the graveyard establish that Clara shares the same name as Oswin , leading the Doctor to surmise they are the same person . As seen on her gravestone , Clara 's birthdate is 23 November , the date Doctor Who was first transmitted in 1963 . + + = = Production = = + + + = = = Writing and design changes = = = + + Writer Steven Moffat stated that he wanted an " epic " quality to the Christmas special . 
The story would also show how the Doctor had responded to losing his previous companions ; Moffat said that " I think he 's probably reached the point in his life where he 's saying , ' Friendship for me is just postponed bereavement — I want to be on my own for a while ' . " Moffat compared the withdrawn Doctor seen at the onset of the episode to the first appearances of the First Doctor ( William Hartnell ) in 1963 and the Ninth Doctor ( Christopher Eccleston ) in 2005 . He also attributed the idea of a retired Doctor to a plot proposed by Douglas Adams in the 1970s , but rejected by the production team at the time . Continuing the theme introduced with the series ' first five episodes , " The Snowmen " was promoted like a movie . A movie poster was released in the Radio Times , showing the Doctor and Clara ascending the ladder to the TARDIS . + The episode saw several major design changes for the series . " The Snowmen " is the debut of a redesigned TARDIS interior , as well as a new title sequence and variation of the theme tune . The new title sequence features a brief glimpse of the Doctor 's face , the first time since Survival , the final serial of the classic series in 1989 , that the Doctor 's face has been seen in the title sequence . Moffat had noticed that the TARDIS ' design was getting " progressively whimsical " and resembled more of a " magical place " rather than a machine . It was designed by series production designer Michael Pickwood , who stated that the new interior was also supposed to be " darker and moodier " and provide an easier access to the " gallery " of the ship when shooting . + The Doctor also wears a one @-@ off costume , Victorian @-@ themed , which Smith described as " a bit Artful Dodger meets the Doctor " . Moffat described the new outfit as a " progression " as the Doctor was in " a different phase of his life now " and felt more " grown @-@ up " and fatherlike . The costume was designed by Howard Burden for this episode . 
" The Snowmen " also contains several references to Sherlock Holmes , including the Doctor dressing up as him . Moffat is co @-@ creator of the BBC series Sherlock , for which Smith auditioned for the role of Doctor Watson before being cast as the Doctor . In addition , the incidental music during the scene bears a resemblance to the Sherlock theme . + + = = = Casting = = = + + This episode marks the return of Jenna @-@ Louise Coleman , who previously appeared in the series opener , " Asylum of the Daleks " . Coleman was cast because of her chemistry with Matt Smith , and especially because she was able to talk faster than him . She auditioned for the role of Clara , not Oswin from " Asylum " , as the concept of the two characters being the same only occurred to Moffat whilst casting for Clara . The production team requested that the press and fans who attended advanced screenings keep Coleman 's appearance a secret until " Asylum " was broadcast ; the effort was ultimately successful . Moffat stated that the introduction of a new companion made " the show feel different " and brought the story to " a new beginning " with a different person meeting the Doctor . Smith said that Clara was different from her predecessor Amy Pond ( Karen Gillan ) , which allowed the audience to see a different side of the Doctor . Coleman described her as resourceful and not intimidated , citing the reason for following the Doctor at the beginning as pursuing answers . The Clara who would become a travelling companion of the Doctor would not debut until the Spring premiere , " The Bells of Saint John " ; save for a brief cameo at the end of " The Snowmen " . Coleman stated that she played each version as individuals with " trust that there would be a payoff " to her mystery . + Also returning to the series are Neve McIntosh as Madame Vastra , Dan Starkey as Strax and Catrin Stewart as Jenny . 
All three previously appeared in " A Good Man Goes to War " and reprised their roles both in this episode and in the prequels . They returned due to the popularity of Vastra and Jenny ; Moffat considered a spin @-@ off featuring them , though he did not have the time to do it . Instead , he decided to bring them back in the main series . Richard E. Grant had previously played the Doctor on two occasions , as an alternative Tenth Doctor in the spoof charity special Doctor Who and the Curse of Fatal Death , which was written by Moffat and as an alternative Ninth Doctor in the animated story Scream of the Shalka which had been intended to be a continuation of the series before it was revived in 2005 . Smith commented that Grant was " born to be a Who villain . He pitches it on that perfect level and tone " . Grant 's appearance in Doctor Who was teased by the BBC via Twitter , announcing his appearance at midnight 5 August 2012 . Tom Ward was drawn to his role because of the quality of the script , and also stated his young children were pleased that he appeared in the programme . The Great Intelligence was voiced by Sir Ian McKellen . The two children Clara is governess to , Digby and Francesca , were played by real @-@ life brother and sister Joseph and Ellie Darcey @-@ Alden . + + = = = Filming and effects = = = + + " The Snowmen " was originally intended to be produced in the fourth production block of the series and be the first episode Coleman shot as her character ; however , it did not begin filming until the week of 6 August 2012 , after Coleman had worked on later episodes while Moffat was writing the Christmas special . The read @-@ through had taken place on 2 August 2012 . This was the first Christmas special to be filmed in BBC Wales ' new Roath Lock studios . Scenes featuring Coleman and several guest stars in a Victorian setting were filmed in Newport , Wales , while Coleman and Smith were also spotted filming in Bristol two weeks later on 21 August . 
Some scenes which used snow props were filmed in Portland Square , Bristol , where filming took place overnight on 21 – 22 August 2012 . Bristol was chosen because it had Victorian @-@ era architecture . Pickwood stated that his favourite set is the London Street with the back of the pub , which he said was based on a sixteenth @-@ century building in Oxford . The locations were blocked off and sprayed with fake snow . + The TARDIS on the cloud was achieved through a mix of fog on the studio floor and post @-@ production special effects . Director Saul Metzstein explained that it was difficult to achieve the desired look for the snowmen ; the first ones he likened to Zippy from Rainbow which was too " cute " of an appearance , and so the effects team created more menacing CGI faces . Clara 's introduction to the TARDIS introduced two novel effects for the show . The first was a single @-@ shot camera tracking from a few feet away from the TARDIS to its interior , with the implication of the TARDIS 's trans @-@ dimensional nature shown to the audience . In the following shot , the camera does a complete circle of the TARDIS console , an effect not seen since the early days of the show . Metzstein wanted to include this shot to further emphasize the " bigger on the inside than the outside " nature of the time machine . + In addition to the three prequel mini @-@ episodes , the cast also filmed an additional promotional video , " Songtaran Carols , " which the BBC uploaded during the days leading up to the broadcast . The video featured Starkey singing modified versions of several Christmas songs in character as Strax as his castmates look on , before everyone breaks character and begins laughing . + + = = Broadcast and reception = = + + " The Snowmen " aired on BBC One on 25 December 2012 at 5 : 15 pm , the same day on BBC America in the US and Space in Canada and the next day on ABC1 in Australia and on Prime in New Zealand . 
UK overnight ratings showed that the special had been watched by 7 @.@ 6 million viewers , coming in sixth for the night . Final consolidated figures ( not including BBC iPlayer viewers ) showed that the episode was watched by 9 @.@ 87 million viewers , coming in fourth for the night . It also received an Appreciation Index figure of 87 , higher than most of the Doctor Who Christmas specials . The iPlayer version had 1 @,@ 467 @,@ 220 views , making it the most popular TV show on iPlayer over Christmas . The US airing was seen by 1 @.@ 43 million viewers , with a 0 @.@ 6 rating in the demographic of adults aged 18 – 49 . + + = = = Critical reception = = = + + The episode received mostly positive reviews . Dan Martin of The Guardian called it " actually the best Christmas Special since ' The Christmas Invasion ' " and the first to be " actually scary " , with " everything we like " about Doctor Who and Christmas . He praised Coleman 's introduction as Clara and the gang of Vastra , Jenny , and Strax . IGN 's Matt Risley gave " The Snowmen " a score of 9 @.@ 4 out of 10 , describing it as " a rollicking , riveting masterclass in storytelling " which " refreshingly " lacked traditional Christmas references " in favour of some sparkling dialogue , gorgeous set design and fascinating characterisation " . While he felt that Grant and McKellen were underused , he was very positive towards Coleman 's " unpredictable " Clara . Radio Times reviewer Patrick Mulkern was pleased with the return of the Great Intelligence despite an inconsistency in the timeline he found , and praised the " lovely images " and direction of the special , though he felt the variation of the theme music " lacks the menace " of the original . While he was positive towards Clara , he was " unmoved by her death " as it was " plainly silly " that she did not look injured . 
+ Nick Setchfield of SFX gave the special four and a half out of five stars , writing that " the power of emotion saves the day again " was appropriate in light of the festivities and many fairytales referenced in the story . Setchfield was positive towards the " terrific " comedy with Strax , Coleman and the " surprisingly underused " Grant , as well as the new title sequence and TARDIS . While he wrote that the subtle callback of the Great Intelligence was " a tad more interesting than the usual ' So , we meet again ! ' schtick " , he ultimately felt their threat " never quite comes into sharp relief " . Neela Debnath of The Independent wrote that " The Snowmen " was stronger than the previous year 's " The Doctor , the Widow , and the Wardrobe " as it was connected to the overall story of the series , but " still has a way to go if it is to live up to ' A Christmas Carol ' " . Despite feeling that it was " enjoyable " , she noted that " the story feels truncated and rushed " . + The Mirror 's Jon Cooper also praised Coleman and the new side of the Doctor that was shown , comparing it to Rose Tyler ( Billie Piper ) challenging the Ninth Doctor ( Christopher Eccleston ) . However , he felt the character @-@ heavy story was to the detriment of the plot , which was " a classic Who set @-@ up that ultimately suffers from a lack of explanation [ and ] more set @-@ pieces than a coherent whole " . He felt that the episode may not have been accessible for casual viewers , but offered much for fans in time for the programme 's fiftieth anniversary . Dominic Cavendish of The Daily Telegraph gave " The Snowmen " three out of five stars , disappointed that it was not as scary as it had been hyped to be . While he was positive towards Smith and the TARDIS on the cloud , he criticised Strax and the " Sudoku @-@ like complexity " of the script . 
+ The episode was nominated for the 2013 Hugo Award for Best Dramatic Presentation ( Short Form ) , alongside " Asylum of the Daleks " and " The Angels Take Manhattan " , but lost to the Game of Thrones episode " Blackwater " . + + = = DVD release = = + + " The Snowmen " was initially released as a standalone on DVD and Blu @-@ ray in the UK and North America . It was later included as part of the DVD / Blu @-@ ray box set Doctor Who : The Complete Seventh Series in September 2013 . + It has subsequently been reissued in several box set compilations , most recently alongside the Christmas specials between " The Christmas Invasion " and " Last Christmas " inclusive in a boxset titled Doctor Who – The 10 Christmas Specials on 19 October 2015 . + + = = Soundtrack = = + + Selected pieces of score from " The Snowmen " and the preceding Christmas special , as composed by Murray Gold , were included on a soundtrack released on 21 October 2013 by Silva Screen Records . + + + = No. 20 Squadron RAAF = + + No. 20 Squadron is a Royal Australian Air Force ( RAAF ) support squadron . Coming under the control of No. 96 Wing , it is responsible for the management of the airfield at RAAF Base Woomera , South Australia . The squadron originated as a maritime patrol unit during World War II . Raised in August 1941 , it operated PBY Catalina and Short Empire flying boats from bases in New Guinea , Queensland and the Northern Territory , conducting search @-@ and @-@ rescue , mine @-@ laying , anti @-@ submarine and bombing missions against Japanese targets in the Pacific theatre . Following the conclusion of hostilities , the squadron was disbanded in March 1946 . It was reactivated as an airfield support squadron in April 2015 . + + = = History = = + + + = = = World War II = = = + + No. 20 Squadron was formed at Port Moresby , New Guinea , on 1 August 1941 for a general reconnaissance role , under the command of Squadron Leader W.N. Gibson . 
Its establishment was six PBY Catalina flying boats and 133 personnel , but only five aircraft ( all transferred from No. 11 Squadron ) and 55 personnel were available initially . The squadron conducted long @-@ range patrols between bases scattered around the islands to Australia 's north in conjunction with No. 11 Squadron . On 18 November , No. 20 Squadron 's Catalinas were augmented by two Short Empire flying boats transferred from No. 11 Squadron . + On 25 November 1941 , following the loss of HMAS Sydney , one of No. 20 Squadron 's Catalinas was despatched to Western Australia to join a No. 11 Squadron Catalina in search @-@ and @-@ rescue missions , but they found only oil slicks . By the outbreak of war in the Pacific , No. 20 Squadron had a strength of six Catalinas and two Empire flying boats . Its personnel at the beginning of December numbered 14 officers and 118 men . The squadron undertook its first sortie of the Pacific War on 8 December ; a Catalina located three Japanese luggers in the vicinity of Thursday Island , Queensland . Later in the month it commenced anti @-@ submarine patrols and , in January 1942 , bombing raids against Japanese bases . As the Japanese advanced into the South West Pacific , No. 20 Squadron was also responsible for evacuating white civilians from areas threatened by invasion . On 21 January , one of its Catalinas located the Japanese fleet steaming for Rabaul and signalled a warning to the town 's Australian defenders before being shot down by anti @-@ aircraft fire ; it was the squadron 's first combat loss . + In the wake of the fall of Rabaul , the Catalinas of Nos. 11 and 20 Squadrons became the RAAF 's only offensive weapon against the Japanese . Their raids on Rabaul did little to stem the Japanese advance , and in the following months Port Moresby itself was subjected to increasingly frequent attacks , which destroyed aircraft , facilities , and squadron records . 
In February 1942 , the Short Empires operated by Nos. 11 and 20 Squadrons were transferred to the newly formed No. 33 ( Transport ) Squadron . No. 20 Squadron lost two Catalinas during patrols on 4 and 6 May ; the nine crewmen of the first were later found to have been captured and beheaded ; the crew of the second were also captured and subsequently disappeared without trace . + In response to the threat of invasion at Port Moresby , Nos. 11 and 20 Squadrons moved to Bowen , Queensland , on 7 May 1942 . They were soon attacking Japanese targets in Lae , Salamaua and Rabaul . On 27 June , each squadron contributed an aircraft to a four @-@ hour raid over Lae and Salamaua during which , as well as bombs , the RAAF crews dropped beer bottles to disrupt the enemy soldiers ' sleep — the sound they made falling through the air was , according to the official history , " something between a shrill whistle and a scream " . By 1 July , No. 20 Squadron 's strength was six Catalinas and 175 personnel , out of a planned establishment of nine aircraft and 415 personnel . Its prime responsibility in early 1942 was maritime reconnaissance as far as New Guinea , the Solomon Islands , and New Caledonia ; the latter half of the year saw a greater focus on night bombing . Now comprising 252 officers and men , the squadron relocated to Cairns on 11 November 1942 . From Cairns it continued to conduct reconnaissance , anti @-@ submarine and occasional bombing operations over the waters around New Guinea . Between December 1942 and March 1943 , No. 20 Squadron 's aircraft flew a total of 9 @,@ 629 hours and dropped 227 tons of bombs . The squadron 's role changed in June 1943 when it commenced mine @-@ laying operations over the Netherlands East Indies and the Philippines , though it continued to make some bombing raids and supply drops . + In September 1944 , No. 20 Squadron became part of No. 76 Wing RAAF , along with Nos. 
42 and 43 Squadrons , and moved to Darwin , Northern Territory . All three squadrons operated Catalinas , their primary purpose being mine @-@ laying . On the night of 30 September , a Catalina of No. 20 Squadron was shot down while attacking a ship at Pomelaa in the Dutch East Indies ; the loss was compounded by the fact that one of the coordinators of the mining campaign , Lieutenant Commander P.E. Carr of the Royal Australian Navy , was aboard the plane and was captured by the Japanese . Another of the squadron 's Catalinas went down on the night of 27 / 28 January 1945 , possibly in a cyclone over the Timor Sea , during the campaign to mine Surabaya . In March , a detachment of four No. 20 Squadron aircraft , along with four from No. 43 Squadron , laid mines off the coast of southern China and Formosa as part of a No. 76 Wing offensive in this area ; these operations were conducted from Leyte Gulf in the Philippines . One of No. 20 Squadron 's Catalinas was lost on the night of 7 / 8 March , most likely owing to bad weather rather than enemy action . Three of the squadron 's aircraft mined the entrance to Hong Kong harbour on 8 April and , on 26 May , four of its Catalinas mined Wenchow harbour in China , the furthest north that any Australian aircraft infiltrated during the war in the Pacific . Three of its aircraft flew the RAAF 's last mine @-@ laying mission on 30 July . + No. 20 Squadron 's final wartime sortie was a patrol on 14 August 1945 . Following the end of the war , the squadron operated in the transport role and ferried Australian prisoners of war home from various locations in South East Asia . It relocated to RAAF Station Rathmines , New South Wales , on 21 November . No. 20 Squadron flew its last mission , a transport flight to Balikpapan , on 21 January 1946 , and disbanded at Rathmines on 27 March . + + = = = Post @-@ war re @-@ establishment = = = + + No. 
20 Squadron was reactivated on 1 April 2015 to support airfield operations at RAAF Base Woomera , South Australia . Consisting of nine uniformed personnel and one Australian Public Service member under the command of Squadron Leader Simon Bartlett , the squadron formed part of No. 96 Wing , a component of Combat Support Group ( CSG ) . The airfield had previously been managed under the auspices of Aerospace Operational Support Group , but a command @-@ and @-@ control review commissioned by the Chief of Air Force recommended that , in common with other RAAF airfields , it should be administered by CSG . RAAF Base Woomera , incorporating Woomera Village , was one of two Air Force units formally established on 12 January 2015 as part of a reorganisation of the Woomera Range Complex , the other unit being RAAF Woomera Test Range . + The design of the reactivated squadron 's crest includes a wedge @-@ tailed eagle to denote courage and nobility , a woomera spear thrower to symbolise the town and its indigenous heritage , Sturt 's Desert Pea to represent South Australia , and the Pleiades star cluster , which features in the folklore of the local Kokatha people . + + + = Light Tank Mk VII Tetrarch = + + The Light Tank Mk VII ( A17 ) , also known as the Tetrarch , was a British light tank produced by Vickers @-@ Armstrongs in the late 1930s and deployed during World War II . The Tetrarch was originally designed as the latest in the line of light tanks built by the company for the British Army . It improved upon its predecessor , the Mk VIB Light Tank , by introducing the extra firepower of a 2 @-@ pounder gun . The War Office ordered 70 tanks , an order that eventually increased to 220 . Production was delayed by several factors , and as a consequence , only 100 to 177 of the tanks were produced . 
+ The tank 's design flaws , combined with the decision by the War Office not to use light tanks in British armoured divisions , ruled out the use of Tetrarchs in the North African Campaign . As a result , the majority of the tanks remained in Britain , although 20 were sent to the USSR as part of the Lend @-@ Lease program . In early 1941 , the Royal Armoured Corps formed three squadrons for use in overseas amphibious operations , one of which was equipped with Tetrarchs . In May 1942 , a small number of Tetrarchs formed part of the British force which participated in the invasion of Madagascar , and , in June 1942 , Tetrarchs were attached to the 1st Airborne Division after it was decided that the design allowed its use as an air @-@ portable light tank to support British airborne forces . The Tetrarchs were transported and landed in specially designed General Aircraft Hamilcar gliders . A lack of gliders prevented their participation in the Allied invasion of Sicily in 1943 ; instead they were attached to the new 6th Airborne Division and became part of the 6th Airborne Armoured Reconnaissance Regiment . + The division used approximately 20 Tetrarchs during the British airborne landings in Normandy in June 1944 . The tanks were successfully landed by glider , but they did not perform well . Several were lost in accidents , and those that did see action proved to be inferior in firepower and armour to the armoured fighting vehicles of the German forces . A few days after the beginning of the operation , the tanks were removed from direct engagement with German armour and used only to provide fire support . By August 1944 , most of the Tetrarchs in action were replaced with Cromwell cruiser tanks , and the remainder were replaced by the M22 Locust in December 1944 . + Tetrarchs did not see any further combat and were deemed obsolete by 1946 ; the last was retired in 1950 . 
There were several variations on the Tetrarch design , including the Alecto self @-@ propelled gun and the Light Tank Mk VIII , but none of these were ever used in active service with the British Army . + + = = Development history = = + + + = = = Initial development = = = + + The prototype of the Light Tank Mk VII ( A17 ) , nicknamed ' Purdah ' , was first developed in 1937 by Vickers @-@ Armstrongs as a private venture , and was intended to be sold either to the British Army or to foreign militaries . It was to be the latest in a series of light tanks produced by the company . The tank was designed to overcome the shortcomings of insufficient armament in earlier light tanks that were fitted only with machine guns . Vickers @-@ Armstrong installed on the Mk VIIs a 2 @-@ pounder 40 @-@ millimetre ( 1 @.@ 6 in ) main gun paired with a 7 @.@ 92 @-@ millimetre ( 0 @.@ 312 in ) Besa machine gun , and mounted the two guns in a two @-@ man turret . The tank possessed a maximum of 14 millimetres ( 0 @.@ 55 in ) of armour . The prototype weighed approximately 16 @,@ 800 pounds ( 7 @,@ 600 kg ) and was powered by a 165 @-@ horsepower ( 123 kW ) Meadows engine . Suspension was on eight road wheels , four per side , with no separate driver or idler wheels and it was capable of a 40 miles per hour ( 64 km / h ) top speed . The Mk VII design relied on an unusual steering method and a mechanical system incorporated into earlier Vickers models . The front wheels could be steered to allow for gentle turns by bending the tracks . For sharper turns , the system returned to the conventional method of braking one track to turn the tank ; the dual system of turning was designed to lessen mechanical strain on the MkVII and reduce its power wastage . The suspension system was also a new design that relied on struts with pockets of air for springing and cushions of oil for damping , and each of the wheels was independently sprung . 
+ The War Office examined the design and put the prototype through a series of trials during May and June 1938 ; the model was tested as a possible " light cruiser " since War Office light tank needs were already met by its predecessor , the Mark VI . The War Office then took the view that the tank was not acceptable as a light cruiser because the Nuffield A13 offered better speed and obstacle crossing performance . Despite this , it was decided that it was essential for some Tetrarchs to be produced , and it was suggested that they be brought in at the end of the light tank program . Accordingly , the War Office gave the Tetrarch the official General Staff specification number A17 , and , in November 1938 , accepted it for limited production after requesting a few minor changes which included the fitting of an external fuel tank to increase the tank 's range . + + = = = Production = = = + + The number to be produced was subject to fluctuation as the War Office vacillated in their demand ; in July 1938 , it requested that 70 of the tanks be produced , then increased the request to 120 after a three @-@ day conference in November . Production was to begin in July 1940 , but meanwhile the War Office temporarily returned to its original order of 70 before increasing the number to 100 . The number further increased to 220 after Metropolitan Cammell Carriage and Wagon , a company part owned by Vickers @-@ Armstrong that would be producing the tanks , indicated it had already ordered armour plating for that many tanks . + Production of the tank was delayed by a number of factors . The War Office put their order on hold in a post @-@ Battle of France decision to focus military production on infantry and cruiser tanks , due to the poor performance of British light tanks during that battle . 
Due to the shortage of more suitable tanks , light tanks that were not designed for use against German armour , were nevertheless deployed against them ; the resulting high casualties led the War Office to re @-@ evaluate the suitability of the light tank design . The pre @-@ war role of the light tank , that of reconnaissance , meanwhile had been found to be better suited to scout cars that used smaller crews and had better cross @-@ country abilities . Further delays were caused by the bombing raids of the Luftwaffe during May 1941 against the factories where the tanks were assembled . + The cumulative effect of these delays resulted in the production of only a small number of Mk VIIs ; estimates place the final total produced to be between 100 and 177 . The name ' Tetrarch ' was given to the Mk VII , on 22 September 1941 , on the orders of the War Office . The last of the tanks were built in the first quarter of 1942 and delivered at the end of the year . + + = = = Transfer to airborne role = = = + + The War Office and the Army were concluding , at this point , that light tanks were a liability and too vulnerable for use in further combat , and the Tetrarch was considered to be obsolete . This decision may have marked the end for the Tetrarch in active service ; several of the tanks destined to be deployed to the Eighth Army in the Middle East for the North African Campaign were left in Britain when their cooling systems were determined to be unable to cope with the intense North African heat . + The demise of Tetrarch was prevented by a decision made by the War Office in mid @-@ 1941 , as it was considering the equipment to be used by Britain 's fledgling airborne forces , formed in June 1940 under the orders of the Prime Minister , Winston Churchill . 
When selecting the equipment for the airborne forces , officials at the War Office concluded that gliders would be an integral component ; gliders would transport troops and heavy equipment , which , by 1941 , was to include artillery and some form of tank . Plans to transport a tank went through a number of revisions , but , by May 1941 , the feasibility of a 5 @.@ 5 metric tons ( 5 @.@ 4 long tons ) tank to be carried for 350 miles ( 560 km ) in a glider was accepted , although the aircraft would have to be specifically designed for the task . In a conference held on 16 January 1941 , it was decided that the General Aircraft Hamilcar , currently under development , would be used to transport a single Tetrarch tank or two Universal Carriers . The Tetrarch was chosen because it was an obsolete design , and was therefore available to be used by the airborne forces . + Beginning in January 1944 , training exercises were conducted carrying the Tetrarchs and their crews inside Hamilcar gliders . These exercises were successful ; during the training by ' C ' Squadron of the Glider Pilot Regiment , which specialised in flying the Hamilcars , over 2 @,@ 800 lifts were made with an average of 50 lifts per crew . Only three incidents resulted in fatalities or injuries , with seven pilots killed during the training . When the Tetrarch was re @-@ designated as an airborne tank , several changes were made to its design . A number of tanks had their 2 pounder guns replaced with a 76 @.@ 2 @-@ millimetre ( 3 @.@ 00 in ) infantry support howitzer ; these tanks were then designated as Tetrarch 1 CS ( Close Support ) . Additionally , Littlejohn adaptors were added to those Tetrarchs which still possessed their 2 pounders to increase their muzzle velocity and armour penetration . + The Tetrarch experienced several setbacks throughout its development and deployment with the Army and airborne forces . 
One of the major problems was the limited number of these tanks that existed after production ended in 1942 , which particularly affected the airborne forces . The transport of 20 of the tanks to the USSR under the Lend @-@ Lease Act depleted the number available for use by airborne forces , as did the loss of several more during Operation Ironclad , the invasion of Madagascar . A Royal Armoured Corps report issued in December 1942 stated that approximately 50 Tetrarchs were available for use . In a memorandum , dated January 1943 , by Major General George F. Hopkinson , commander of the 1st Airborne Division , Hopkinson complained that he had been informed that 70 of the tanks were available , whereas only 50 actually remained , with no reserves to replace those lost in combat . This lack of sufficient replacement reserves , combined with a War Office report that some 287 airborne tanks would be required for the 1st Airborne Division and an unnamed airborne division to be formed in India , led to the Tetrarch 's eventual replacement by the US M22 Locust . + + = = Performance = = + + A number of design faults of the Tetrarch were revealed through its operational use . Its size limited the possible crew to three , a driver in the hull and a gunner and commander in the turret , resulting in too few crew members to operate the Tetrarch effectively . The gunner or commander , in addition to his own duties , had to act as loader for the 2 pounder , which caused delays in combat . A report on the tank written in January 1941 stated that as the commander had to both fight and control the tank , controlling a troop of Tetrarchs during combat would be almost impossible . + Problems were also found with the Littlejohn adaptor fitted to the 2 @-@ pounder to increase its range and penetration power ; after they had been fitted the adapters could not be removed , and could only fire specially designed armour @-@ piercing rounds , which took time to manufacture . 
+ The War Office also considered the Tetrarch 's cooling system faulty , making the tank unsuitable for service in hotter climates , such as the Middle East and North Africa . + + = = Operational history = = + + + = = = Lend @-@ lease = = = + + The first Tetrarchs were delivered to the Army in November 1940 , and were initially deployed with the 1st Armoured Division ( which was being refitted after losing the majority of its previous tanks during the Battle of France ) and the newly formed 6th Armoured Division . However , the faults discovered with the Tetrarch cooling system precluded them from being integrated into units that were sent to the Middle East to participate in the North African Campaign . Shortly after , all light tanks were discarded from the establishments of British armoured divisions as not suitable for further service . + The Tetrarchs remained in Britain , and would probably have been used as training vehicles before being retired from service , but on 22 June 1941 the German invasion of the USSR , Operation Barbarossa began , and the USSR became an ally of Britain . The Lend @-@ Lease program , begun in March 1941 by the United States of America to supply defensive materials to Britain and China , was therefore extended to the USSR . As part of the program , the British government began supplying war materials to the USSR , which in early 1942 , included a shipment of 20 Tetrarchs , as well as a number of Valentine and Matilda Mk I Infantry tanks . The Soviet military utilised a greater number of light tanks than the British , and so could use the Tetrarchs . When the tanks arrived in the USSR , however , it was apparent that the design problems with the cooling system were also present in cold conditions ; additionally , the cold weather had a deleterious effect on the tank 's suspension and tracks . 
Additional testing of the Tetrarchs was conducted by the Soviet military and the design was admired for its controllability , manoeuvrability , and speed , as well as its ability to run on low @-@ quality fuel , unlike contemporary Soviet designs . The thinness of the Tetrarch 's armour was found to be a problem and one which could not be solved , as the weight of extra armour plating caused an unacceptable reduction in the tank 's speed . Despite these drawbacks in the Tetrarch 's design , Soviet authorities believed it to be comparable to the T @-@ 70 light tank in use at the time , and decided that it was suitable to be used in combat . A number of Tetrarchs were sent to Tank Training Schools which were subsequently sent into battle , and in September 1943 two were assigned to the 132nd Separated Tank Battalion , which was attached to the 5th Guards Tank Brigade ; both tanks were destroyed in combat , one on 30 September and the other on 2 October , the latter a casualty of artillery fire . Several were also used for propaganda purposes , appearing in photographs of Soviet troops who were fighting in the Caucasus region . + + = = = Operation Ironclad = = = + + In mid @-@ 1941 , the Royal Armoured Corps in Britain created three tank squadrons for special overseas operations , known as ' A ' , ' B ' and ' C ' Special Service Squadrons . Both ' A ' and ' B ' Squadrons were equipped with Valentine Infantry tanks and Mark VIc light tanks , but ' C ' Squadron was equipped with twelve Tetrarchs transferred from the 2nd Armoured Brigade , 1st Armoured Division . On 31 July 1941 , ' C ' Squadron was officially activated and immediately received orders to prepare for overseas service alongside ' A ' and ' B ' Squadrons in an unspecified tropical climate . All three squadrons were transported to Inverary in Scotland for intensive training that focused on embarkation and disembarkation from ships and landing craft to prepare them for action in potential amphibious operations . 
In early September , elements of ' C ' Squadron , including six Tetrarchs , formed part of a force which sailed for Freetown in West Africa ; during this period of the war there were fears that the Spanish government might enter the conflict on the side of Germany , and the force was readied to capture a number of Spanish islands off the coast of Africa if this occurred . These fears proved groundless , and in March 1942 , the unit returned to Britain to join the rest of the squadron in training . + The next assignment , Operation Ironclad , was the invasion of Madagascar , the third largest island in the world and then under Vichy French control . The Prime Minister and the Combined Chiefs of Staff decided that Madagascar should be occupied as rapidly as possible to deny the port of Antsirane to Japanese naval forces , which had recently advanced into the Indian Ocean . Operation Ironclad was under the command of Major General Robert G. Sturges and consisted of No. 5 Commando , 29th Independent Brigade Group , and the 17th and 13th brigade groups from 5th Infantry Division . The 29th Brigade formed the core of the invasion force due to its training in amphibious operations , and under its command was ' B ' Special Service Squadron , created by amalgamating six Valentines from ' B ' Squadron and six Tetrarchs from ' C ' Squadron into a single unit . The squadron was formed into four troops , one Headquarters troop of three Valentines and one Tetrarch , one of four Valentines , and two formed from the remaining five Tetrarchs . The invasion force assembled off the west coast of the northern tip of Madagascar on 4 May , near Antsirane and the bay of Diego Suarez . The invasion plan called for an amphibious assault landing on four beaches on the west side of the tip , which would allow the British forces to advance approximately 20 miles ( 32 km ) and approach Antsirane from the rear . 
Information about the landing beaches , the defences possessed by the port , and the Vichy French defending forces was limited and vague , although it was believed that the defenders had no weapons capable of penetrating the armour of a Valentine tank . + The landings began at 04 : 30 on 5 May , with 5 Commando landing at Courrier Bay and the three infantry brigades and ' B ' Squadron landing at Ambararata Bay . The objective of the infantry brigades and their armoured support was to take control of Antsirane and a nearby town , but although the infantry landed successfully , ' B ' Squadron had more trouble ; the area of beach designated for its landing craft was blocked for several hours after a Tetrarch came loose from a landing craft and became stuck in the sand . The infantry brigades advanced toward Antsirane without the squadron , but eventually two Valentines and a single Tetrarch were dispatched in support , catching up with the lead elements of the infantry near the town of Anamakia . Here the invasion force encountered the first French defences , consisting of camouflaged trenches and pillboxes dug in along a ridge . The tanks attempted to breach them , but the rocky ground made manoeuvring difficult and they could not close with the pillboxes and trenches ; they engaged a number of targets with 2 pounder and machine @-@ gun fire , but the line had to be cleared by an infantry assault later in the day . The tanks were ordered to outflank the defences and advance further into the island , and they were soon joined by two other Tetrarchs dispatched from the beaches ; the small force continued to advance until it encountered the Vichy French main line of defence . This had been built prior to the First World War and included camouflaged pillboxes , machine @-@ gun nests and dug @-@ in 75 mm artillery pieces ; the latter , although not specifically designed for an anti @-@ tank role , could penetrate the armour of both the Tetrarchs and the Valentines . 
The two Valentines advanced first but were knocked out by artillery fire , and two Tetrarchs that were moving behind them suffered the same fate ; the third Tetrarch retreated in order to report on the French resistance , machine gunning a motorcycle combination and a truck it encountered on the way back . + The commander of the Tetrarch made his report , and was then ordered to take command of four Valentines and two Tetrarchs which had recently arrived and once again attempt to breach the French defences . The tanks followed the road leading to the defensive line and then attempted to out @-@ flank the line by advancing from the right @-@ hand side , using several hills as cover ; the artillery pieces were able to turn and face the assault , however , and one Valentine and one Tetrarch were hit and destroyed . The remaining tanks exchanged several volleys of fire with the artillery pieces before retreating back to their original positions . The French line was eventually broken by 29th Brigade , aided by an amphibious assault by Royal Marines ; the remaining tanks of ' B ' Squadron , two Valentines and three Tetrarchs , remained in defensive positions until the afternoon of 6 May , coming under sporadic artillery fire which disabled another Valentine . The squadron played no further part in the battle , as the Vichy French authorities negotiated a formal surrender the following day , although French troops would continue to engage the British occupying force in guerrilla warfare until late November . ' C ' Squadron suffered heavy casualties during the invasion ; only one Valentine and three Tetrarchs out of twelve tanks were functional by 7 May , and the squadron had suffered seven killed and six wounded . It remained in Madagascar until early 1943 , when it was shipped to India and took part in the Burma Campaign as part of 29th Brigade . 
+ + = = = Operation Tonga = = = + + Because of a lack of equipment training facilities in mid @-@ 1940 , when the British airborne establishment was formed , the War Office was able to accept only 500 volunteers for training as airborne troops . Progress in setting up proper training facilities and acquiring suitable transport aircraft was so slow that the first British airborne operation , Operation Colossus , was conducted by a retrained Commando unit . By 1942 , there existed specifically trained airborne units , including the 1st Airborne Division , and on 19 January 1942 the War Office decided that a light tank unit would be one of the support units attached to the division . This unit , designated the Light Tank Squadron , was to be formed of nineteen light tanks and would operate to the fore of the division , using their tanks ' speed to capture objectives and then holding them until relieved by other units . The obvious unit for conversion was ' C ' Special Services Squadron , as it was trained to act as an independent tank unit and , more importantly , was the only unit that was still using Tetrarchs ; it had been re @-@ designated as an airborne tank by the War Office . ' C ' Squadron was officially transferred to the 1st Airborne Division on 24 June 1942 , bringing with it seven Tetrarchs among its other vehicles . The unit immediately began training , but was not attached to the 1st Airborne Division for long ; during mid @-@ 1943 , the division was transported to the Middle East so it could participate in the Allied invasion of Sicily . ' C ' Squadron remained in Britain , as not enough Hamilcar gliders had been built by the time the division departed to transport its Tetrarchs ; the squadron was transferred to the 6th Airborne Division , which had been raised in April 1943 , and ' C ' Squadron remained with it for the rest of the conflict . 
The squadron continued to train as an air @-@ portable unit , and participated in a number of exercises to prepare for its new duties , including reconnaissance of enemy positions and counter @-@ attacking enemy infantry and armour . + On 13 December 1943 , the War Office decided to expand the squadron into a regiment equipped with a combination of light tanks and conventional reconnaissance vehicles such as scout cars , and on 1 April 1944 , it was re @-@ designated as the 6th Airborne Armoured Reconnaissance Regiment . The regiment consisted of a Headquarters Squadron , a Light Tank Squadron and a Reconnaissance Squadron ; two Tetrarchs , the Mark 1 CS variation , were attached to the Headquarters Squadron , but the Light Tank Squadron , also known as ' A ' Squadron , received the majority of the Tetrarchs . ' A ' Squadron had approximately nineteen Tetrarchs split between six troops , two of which were of the CS variation and the rest were armed with 2 pounders fitted with Littlejohn adaptors . On 24 May 1944 , after participating in a further series of exercises and manoeuvres , ' A ' Squadron moved from their training area to a transit camp at Tarrant Rushton airfield , while the rest of the regiment moved to RAF Brize Norton airfield the next day ; from these two airfields , the regiment would be transported to participate in the British airborne landings in Normandy . The operation began on the night of 5 June , with the deployment of 6th Airborne Division to eastern Normandy . It was tasked with protecting the eastern flank of the Allied seaborne landings , securing strategically important areas east of Caen , capturing several important bridges over the Caen Canal and River Dives , and destroying a coastal artillery battery . Insufficient transport aircraft were available to land all three of the division 's brigades simultaneously ; one would have to be landed in a second lift later in the day .
Major General Richard Gale had initially intended for the 6th Airlanding Brigade , to which the 6th Airborne Armoured Reconnaissance Regiment was attached , to be landed first ; however , aerial photography revealed that anti @-@ glider poles had been erected in the landing zone selected for the brigade . Therefore , Gale decided that the 3rd Parachute Brigade and 5th Parachute Brigade ( which did not utilise gliders ) should land in the first lift to clear the landing zones , allowing the 6th Airlanding Brigade to land in the second lift . + The Horsa and Hamilcar gliders of the brigade landed at 21 : 00 on 6 June in a landing zone cleared of obstructions by the 5th Parachute Brigade . The primary tasks of the brigade were to bring in reinforcements and supplies , and to aid the two parachute brigades in consolidating the area held by the division ; the 6th Airborne Armoured Reconnaissance Regiment was to aid in the latter task , acting as a reconnaissance force to scout out German positions and impede the movement of German forces attempting to counter @-@ attack . The Tetrarchs of ' A ' Squadron were to play an integral part in this reconnaissance role due to their speed , but the squadron 's strength of twenty tanks was severely depleted by the time it landed in Normandy . It lost one tank before the formation landed when the Tetrarch broke loose of its shackles and crashed through the nose of the glider that was carrying it , causing both to fall into the sea mid @-@ flight . The squadron 's strength was further weakened when two gliders collided with each other in the landing zone , destroying themselves and the Tetrarchs they carried ; a third Hamilcar hit another Tetrarch as it was being unloaded and flipped the tank upside down , rendering it unusable , although the crew escaped without injury .
The surviving tanks were then rendered temporarily immobile when parachute rigging lines became tangled in their suspensions , forcing their crews to cut the lines away with welding torches . + The squadron retrieved all of the remaining Tetrarchs and advanced to the south of the landing zone to link up with the rest of the regiment ; there , they received orders to support the 8th Parachute Battalion in the Bois de Bavent area and conduct reconnaissance duties . After linking with the battalion , the squadron began reconnoitring , and engaged German infantry and armour they encountered . By the end of 7 June , two Tetrarchs had been lost to enemy action , one destroyed by a German self @-@ propelled gun and the second by hitting a mine . The division was reinforced by British troops who were advancing from the invasion beaches and it began to push through Normandy , while the squadron continued its reconnaissance duties . At this time , Gale decided to avoid , when possible , engaging the Tetrarchs with German armour , as they proved to be completely outclassed by the German tanks and self @-@ propelled guns , such as the Panzer IV and the Sturmgeschütz III . Instead , when the division required armoured support , it summoned it from armoured units outside the division , and the Tetrarchs were used to support infantry patrols and provide fire support . By August , in the division 's preparation for the planned breakout from the Normandy bridgehead , the majority of Tetrarchs in ' A ' Squadron were replaced with Cromwell fast cruiser tanks ; only three Tetrarchs remained , assigned to the Headquarters troop of ' A ' Squadron . + + = = = Post @-@ war = = = + + Operation Tonga was the last that Tetrarchs saw of active combat . During the first week of October 1944 , the 6th Airborne Armoured Reconnaissance Regiment underwent an extensive reorganization , in which it was completely restructured , and all the remaining Tetrarchs were retired . 
They were replaced with the M22 Locust , a purpose @-@ built airborne light tank of American design ; eight Locusts were used by the regiment in March 1945 during Operation Varsity , the airborne operation to cross the river Rhine . A report issued by the Director ( Air ) of the War Office in January 1946 confirmed that the Tetrarch design was considered obsolete , and any light tanks used in post @-@ war airborne formations would be entirely new in design . A small number of Tetrarchs remained in service with the 3rd Hussars until 1949 ; a Hamilcar glider flight was stationed at RAF Fairford , and a troop of Tetrarchs was kept by the regiment for training exercises with the gliders . However , glider training by the regiment was stopped in 1950 and the Tetrarchs withdrawn from service . + + = = Variants = = + + There were several variants of the Tetrarch design . The first was the Light Tank Mk VIII , Vickers @-@ Armstrong 's proposed successor to the Tetrarch . The Mark VIII was also known as the Harry Hopkins , named after President Roosevelt 's chief diplomatic advisor , and was given the General Staff design number A25 by the War Office . The Mark VIII was intended to improve upon the design of the Tetrarch in a number of areas . It had thicker armour than the Tetrarch , with the front hull and turret armour increased to a thickness of 38 millimetres ( 1 @.@ 5 in ) and the side armour to 17 millimetres ( 0 @.@ 67 in ) , and the turret and hull given more sloped surfaces to help deflect shells fired at the tank . The dimensions of the Tetrarch were also changed ; the Mark VIII was longer by 6 inches ( 0 @.@ 15 m ) , wider by 1 foot 3 inches ( 0 @.@ 38 m ) and heavier . The new tank was no longer air @-@ portable , as it was too heavy to be carried by a Hamilcar . 
The 12 @-@ cylinder engine of the Tetrarch was fitted to the Mark VIII , although the increased weight meant that its maximum speed decreased to 30 miles per hour ( 48 km / h ) ; its armament also remained the same as that of the Tetrarch . The War Office authorised the construction of three prototype models in April 1941 . The new design was considered a success , and the Tank Board of the War Office ordered 1 @,@ 000 to be constructed in September . However , problems were encountered with further tests of the prototypes , and a report issued in December 1942 stated that production of the Mark VIII had been delayed due to developmental problems . These problems persisted into 1943 , when the War Office decided against using the tank in active service ; approximately 100 Mark VIIIs were produced by 1945 , when production ended . + A second variant on the Tetrarch design was the Tetrarch Duplex Drive ( " Tetrarch DD " ) . The Duplex Drive system was invented by Nicholas Straussler , and was designed to allow a tank to ' swim ' through water and participate in amphibious operations . The system functioned by erecting a large waterproof canvas screen around the tank above its tracks , which was supported by thirty @-@ six inflatable tubes and steel struts ; this gave the tank sufficient buoyancy to float , and the tank was then propelled along by a small propeller powered by its engine . The screen could be collapsed by using a small explosive charge once the tank reached land . The system was fitted during June 1941 , as the Tetrarch was the lightest light tank available at the time ; the converted tank was successfully tested on a number of lakes and reservoirs , allowing the Duplex Drive system to be tested on heavier tanks , such as the Valentine . The system would be used during Operation Overlord , when M4 Sherman medium tanks would land on the invasion beaches .
+ + + = QuackShot = + + QuackShot , known in Japan as I Love Donald Duck : Georgia Ou no Hihou ( Japanese : アイラブドナルドダック グルジア王の秘宝 , Hepburn : Ai Rabu Donarudo Dakku Gurujia Ō no Hihō ) , is a 1991 platforming video game developed and published by Sega for the Sega Genesis . The game was released in Europe in 1991 , in North America on December 19 , 1991 and in Japan on December 20 , 1991 . QuackShot stars Donald Duck and his three nephews , Huey , Dewey , and Louie , as treasure @-@ hunters , and is part of a series of games published by Sega that were based on Walt Disney cartoon characters . + QuackShot was released to mostly positive reviews from video game journalists . The game was universally lauded for its graphics , with magazines like Sega Pro describing them as " some of the best graphics around . " The game was also praised for its music and puzzles , as well as their clever use in the game . However , QuackShot was criticized for its controls , being described by IGN as " float @-@ y " and making certain segments of the game unnecessarily difficult . The game was also criticized for its lack of difficulty overall as well as its lack of speech samples , which several other Genesis games of the time had . + + = = Gameplay = = + + The player , as Donald , ventures through a variety of side @-@ scrolling levels . Generally , each level is divided into an overland part and a dungeon , such as the Maharajah 's palace or the temple in which the Great Duck Treasure resides . Although the player may choose any order to play the overland sections , various obstacles prevent the player from entering the dungeons outside of a specific order . In addition to this , some levels provide the player with vital clues which solve puzzles needed to progress in later sections . Once Donald has completed the overland section of an area , he may leave by calling his nephews ' biplane , and will return to the dungeon entrance of that area if the player chooses to return . 
+ Donald is armed with a special gun that can shoot plungers , popcorn or bubble gum . Donald has unlimited plungers which can only temporarily stun enemies ( though bosses can still be damaged with plungers ) , and can collect popcorn and gum along the way or get the latter from Gyro Gearloose . Later in the game , the plunger is upgraded to act as a temporary platform to climb walls with and , when stuck to a passing bird , allows Donald to traverse longer distances . In Duckburg , India and Egypt , Donald can also pick up chili peppers which increase his temper , eventually temporarily allowing him to become invincible , increase his speed and knock out enemies in his path . + + = = Plot = = + + While Donald is flipping through some books in Scrooge McDuck 's library , a map falls out of a book relating to the treasure of King Garuzia , ruler of the Great Duck Kingdom in ancient times . The map leads to the location of the king 's most prized possession , hidden in a secret place shortly before his death . Donald thinks this is his path to riches . Unfortunately Big Bad Pete overhears and pursues Donald throughout the game hoping to steal the treasure . + Teamed with his nephews Huey , Dewey , and Louie , and using the partial map from the library , Donald begins his search in Duckburg , with the trail being directed to an Aztec pyramid in Mexico . Outside the pyramid , he is directed by a " sweet seniorita " to obtain a " hero key " from an explorer back in Duckburg to open the pyramid . Inside the pyramid , Donald meets Goofy , who gives him a strange note and a plunger to help him reach higher places , and tells him that Gyro Gearloose is looking for him back in Duckburg . Travelling across the rooftops of Duckburg to meet Gyro , Donald is given Gyro 's latest invention , bubblegum ammo that can break through walls . 
The last location on the partial map is Count Dracula 's castle in Transylvania , where Donald encounters a ghost who tells him that the Count carries the real treasure map . + After defeating Dracula , Donald receives a more complete map . In India , Donald enters the palace of the Maharajah , where she challenges him to defeat the tiger in her garden in exchange for a Sphinx Tear . Donald succeeds and receives the Sphinx Tear , which is the key to open a temple in Egypt . Donald is able to solve the " Riddle of the Sphinx " using the note Goofy had given him , and obtains the Scepter of Ra before escaping in a mine cart . From there , he journeys to the South Pole , where he finds a key frozen in ice , and uses the Scepter of Ra to melt the ice and grab the key . The key unlocks the hold of a Viking ship , which contains an ancient diary with the secret to locating the treasure . The ship is haunted by ghosts , and the Viking captain sends Donald below decks to get rid of them . After defeating a skeletal Viking warrior , Donald returns to the deck , where the captain informs him that the diary is hidden in ice near the South Pole , and gives him an " ancient Viking plunger " that attaches to flying creatures . Donald then returns to the South Pole , hitching a ride on one of Pete 's bird minions to reach the diary . + However , when Donald finds the diary , Pete shows up , holding Donald 's nephews hostage in exchange for the diary . After giving Pete the diary , Donald travels to Pete 's hideout to defeat Pete and get the diary back . The diary reveals that the map , when dipped in water , will reveal the location of the Great Duck Treasure . Donald flies to the island where the treasure is hidden and manages to evade its traps in order to reach the treasure vault . After defeating the ancient spirit guarding the treasure , Donald opens the vault only to find a simple stone statue .
When the disappointed Donald returns home , Huey , Dewey and Louie accidentally break the statue , which reveals a golden jeweled necklace was hidden inside . Donald gives the necklace to Daisy and the two fly off into the sunset together . + + = = Development and release = = + + QuackShot was developed and published by Sega for the Sega Genesis as part of a series of games that were based on Walt Disney cartoon characters . The game was released in Europe in 1991 , in North America on December 19 , 1991 and in Japan on December 20 , 1991 . QuackShot was later released as part of a bundle called The Disney Collection for Genesis in 1996 alongside Castle of Illusion . The game was also ported to the Sega Saturn and released exclusively in Japan alongside Castle of Illusion again as part of the Sega Ages series in 1998 , entitled Sega Ages : I Love Mickey Mouse . + + = = Reception = = + + QuackShot received a mostly positive response from critics upon release . GameRankings , an aggregator for video game reviews , assigned the game a score of 77 % based on 2 reviews . Mega placed the game at # 7 in their " Top Mega Drive Games of All Time " list . MegaTech magazine praised the game 's graphics , but criticized the game 's easy difficulty level , explaining simply that " the graphics are excellent , but the game is easy to complete . " Damian Butt from Sega Pro also praised the graphics , stating that the game has " without [ a ] doubt some of the best graphics around " and that " the sprites and backgrounds are consistently excellent . " He also noted the game 's various puzzles and their use in the game , explaining that " [ e ] ven if the ideas are not original , the way they are strung together to accelerate the pace to overload is nothing short of breath @-@ taking . " Levi Buchanan from IGN gave QuackShot a 7 @.@ 3 / 10 , also lauding the graphics and animation as excellent and saying the music was pleasing . 
+ Butt criticized Donald 's controls in certain situations in the game , as well as the difficulty of some levels and puzzles . Buchanan also criticized the controls , calling them " float @-@ y " and noted the difficulty in executing precision jumps , explaining that " [ i ] t 's far too easy to over- or under @-@ shoot a narrow column and slip to your doom . " Butt was also " dubious of the number of credits , " stating that the game may seem easy with unlimited continues , but that the player will " still need considerable skill to reach the treasure island . " Buchanan was disappointed with the lack of speech samples , explaining that it 's " a bit of a drag with a character that is so defined by his voice . " Ultimately , Butt said that " [ y ] ounger players will instantly be enthralled by Donald 's quest " and that " QuackShot is everything a cartoon game should be and more . " Buchanan summed up the game as being a " good platformer tripped up by some questionable controls " and recommended the game as " a mildly enjoyable 16 @-@ bit platformer that would fit nicely in your Genesis collection . " + + + = Olmec colossal heads = + + The Olmec colossal heads are at least seventeen monumental stone representations of human heads sculpted from large basalt boulders . The heads date from at least before 900 BC and are a distinctive feature of the Olmec civilization of ancient Mesoamerica . All portray mature men with fleshy cheeks , flat noses , and slightly crossed eyes ; their physical characteristics correspond to a type that is still common among the inhabitants of Tabasco and Veracruz . The backs of the monuments often are flat . The boulders were brought from the Sierra de los Tuxtlas mountains of Veracruz . 
Given that the extremely large slabs of stone used in their production were transported over large distances , requiring a great deal of human effort and resources , it is thought that the monuments represent portraits of powerful individual Olmec rulers . Each of the known examples has a distinctive headdress . The heads were variously arranged in lines or groups at major Olmec centres , but the method and logistics used to transport the stone to these sites remain unclear . + The discovery of a colossal head at Tres Zapotes in the nineteenth century spurred the first archaeological investigations of Olmec culture by Matthew Stirling in 1938 . Seventeen confirmed examples are known from four sites within the Olmec heartland on the Gulf Coast of Mexico . Most colossal heads were sculpted from spherical boulders but two from San Lorenzo Tenochtitlán were re @-@ carved from massive stone thrones . An additional monument , at Takalik Abaj in Guatemala , is a throne that may have been carved from a colossal head . This is the only known example from outside the Olmec heartland . + Dating the monuments remains difficult because of the movement of many from their original contexts prior to archaeological investigation . Most have been dated to the Early Preclassic period ( 1500 – 1000 BC ) with some to the Middle Preclassic ( 1000 – 400 BC ) period . The smallest weigh 6 tons , while the largest is variously estimated to weigh 40 to 50 tons , although it was abandoned and left unfinished close to the source of its stone . + + = = Olmec civilization = = + + The Olmec civilization developed in the lowlands of southeastern Mexico between 1500 and 400 BC . The Olmec heartland lies on the Gulf Coast of Mexico within the states of Veracruz and Tabasco , an area measuring approximately 275 kilometres ( 171 mi ) east to west and extending about 100 kilometres ( 62 mi ) inland from the coast . 
The Olmecs are regarded as the first civilization to develop in Mesoamerica and the Olmec heartland is one of six cradles of civilization worldwide , the others being the Norte Chico culture of South America , the Erlitou culture of China 's Yellow River , the Indus Valley Civilization of south Asia , the civilization of ancient Egypt and the Sumerian civilization of ancient Iraq . Of these , only the Olmec civilization developed in a lowland tropical forest setting . + The Olmecs were the first inhabitants of the Americas to construct monumental architecture and to settle in towns and cities . They were also the first people in the Americas to develop a sophisticated style of stone sculpture . In the first decade of the 21st century evidence emerged of Olmec writing , with the earliest examples of Olmec hieroglyphs dating to around 650 BC . Examples of script have been found on roller stamps and stone artefacts ; the texts are short and have been partially deciphered based on their similarity to other Mesoamerican scripts . The evidence of complex society developing in the Olmec heartland has led to the Olmecs being regarded as the " Mother Culture " of Mesoamerica , although this concept remains controversial . + Some of the Olmecs ' rulers seem to have served religious functions . The city of San Lorenzo was succeeded as the main centre of the civilization by La Venta in about 900 BC , with Tres Zapotes and Laguna de los Cerros possibly sharing the role ; other urban centres were much less significant . The nature and degree of the control exercised by the centres over a widespread rural population remains unclear . Very fine Olmec art , much clearly made for an elite , survives in several forms , notably Olmec figurines , and larger sculptures such as The Wrestler . The figurines have been recovered in large numbers and are mostly in pottery ; these were presumably widely available to the population . 
Together with these , of particular relevance to the colossal heads are the " Olmec @-@ style masks " in stone , so called because none have yet been excavated in circumstances that allow the proper archaeological identification of an Olmec context . These evocative stone face masks present both similarities and differences to the colossal heads . Two thirds of Olmec monumental sculpture represents the human form , and the colossal heads fall within this major theme of Olmec art . + + = = Dating = = + + The colossal heads cannot be precisely dated . However , the San Lorenzo heads were buried by 900 BC , indicating that their period of manufacture and use was earlier still . The heads from Tres Zapotes had been moved from their original context before they were investigated by archaeologists and the heads from La Venta were found partially exposed on the modern ground surface . The period of production of the colossal heads is therefore unknown , as is whether it spanned a century or a millennium . Estimates of the time span during which colossal heads were produced vary from 50 to 200 years . The San Lorenzo heads are believed to be the oldest , and are the most skillfully executed . All of the stone heads have been assigned to the Preclassic period of Mesoamerican chronology , generally to the Early Preclassic ( 1500 – 1000 BC ) , although the two Tres Zapotes heads and the La Cobata Head are attributed to the Middle Preclassic ( 1000 – 400 BC ) . + + = = Characteristics = = + + Olmec colossal heads vary in height from 1 @.@ 47 to 3 @.@ 4 metres ( 4 @.@ 8 to 11 @.@ 2 ft ) and weigh between 6 and 50 tons . All of the Olmec colossal heads depict mature men with flat noses and fleshy cheeks ; the eyes tend to be slightly crossed . The general physical characteristics of the heads are of a type that is still common among people in the Olmec region in modern times . The backs of the heads are often flat , as if the monuments were originally placed against a wall . 
All examples of Olmec colossal heads wear distinctive headdresses that probably represent cloth or animal hide originals . Some examples have a tied knot at the back of the head , and some are decorated with feathers . A head from La Venta is decorated with the head of a bird . There are similarities between the headdresses on some of the heads that have led to speculation that specific headdresses may represent different dynasties , or perhaps identify specific rulers . Most of the heads wear large earspools inserted into the ear lobes . + All of the heads are realistic , unidealised and frank descriptions of the men . It is likely that they were portraits of living ( or recently deceased ) rulers well known to the sculptors . Each head is distinct and naturalistic , displaying individualised features . They were once thought to represent ballplayers although this theory is no longer widely held ; it is possible , however , that they represent rulers equipped for the Mesoamerican ballgame . Facial expressions depicted on the heads vary from stern through placid to smiling . The most naturalistic Olmec art is the earliest , appearing suddenly without surviving antecedents , with a tendency towards more stylised sculpture as time progressed . Some surviving examples of wooden sculpture recovered from El Manatí demonstrate that the Olmecs are likely to have created many more perishable sculptures than works sculpted from stone . + In the late nineteenth century , José Melgar y Serrano described a colossal head as having " Ethiopian " features and speculations that the Olmec had African origins resurfaced in 1960 in the work of Alfonso Medellín Zenil and in the 1970s in the writings of Ivan van Sertima . Such speculation is not taken seriously by Mesoamerican scholars such as Richard Diehl and Ann Cyphers . + Although all the colossal heads are broadly similar , there are distinct stylistic differences in their execution .
One of the heads from San Lorenzo bears traces of plaster and red paint , suggesting that the heads were originally brightly decorated . Heads did not just represent individual Olmec rulers ; they also incorporated the very concept of rulership itself . + + = = Manufacture = = + + The production of each colossal head must have been carefully planned , given the effort required to ensure the necessary resources were available ; it seems likely that only the more powerful Olmec rulers were able to mobilise such resources . The workforce would have included sculptors , labourers , overseers , boatmen , woodworkers and other artisans producing the tools to make and move the monument , in addition to the support needed to feed and otherwise attend to these workers . The seasonal and agricultural cycles and river levels needed to have been taken into account to plan the production of the monument and the whole project may well have taken years from beginning to end . + Archaeological investigation of Olmec basalt workshops suggests that the colossal heads were first roughly shaped using direct percussion to chip away both large and small flakes of stone . The sculpture was then refined by retouching the surface using hammerstones , which were generally rounded cobbles that could be of the same basalt as the monument itself , although this was not always the case . Abrasives were found in association with workshops at San Lorenzo , indicating their use in the finishing of fine detail . Olmec colossal heads were fashioned as in @-@ the @-@ round monuments with varying levels of relief on the same work ; they tended to feature higher relief on the face and lower relief on the earspools and headdresses . Monument 20 at San Lorenzo is an extensively damaged throne with a figure emerging from a niche . Its sides were broken away and it was dragged to another location before being abandoned .
It is possible that this damage was caused by the initial stages of re @-@ carving the monument into a colossal head but that the work was never completed . + All seventeen of the confirmed heads in the Olmec heartland were sculpted from basalt mined in the Sierra de los Tuxtlas mountains of Veracruz . Most were formed from coarse grained dark grey basalt known as Cerro Cintepec basalt after a volcano in the range . Investigators have proposed that large Cerro Cintepec basalt boulders found on the southeastern slopes of the mountains are the source of the stone for the monuments . These boulders are found in an area affected by large lahars ( volcanic mudslides ) that carried substantial blocks of stone down the mountain slopes , which suggests that the Olmecs did not need to quarry the raw material for sculpting the heads . Roughly spherical boulders were carefully selected to mimic the shape of a human head . The stone for the San Lorenzo and La Venta heads was transported a considerable distance from the source . The La Cobata head was found on El Vigia hill in the Sierra de los Tuxtlas and the stone from Tres Zapotes Colossal Head 1 and Nestepe Colossal Head 1 ( also known as Tres Zapotes Monuments A and Q ) came from the same hill . + The boulders were transported over 150 kilometres ( 93 mi ) from the source of the stone . The exact method of transportation of such large masses of rock is unknown , especially since the Olmecs lacked beasts of burden and functional wheels , and they were likely to have used water transport whenever possible . Coastal currents of the Gulf of Mexico and in river estuaries might have made the waterborne transport of monuments weighing 20 tons or more impractical . Two badly damaged Olmec sculptures depict rectangular stone blocks bound with ropes . A largely destroyed human figure rides upon each block , with their legs hanging over the side .
These sculptures may well depict Olmec rulers overseeing the transport of the stone that would be fashioned into their monuments . When transport over land was necessary , the Olmecs are likely to have used causeways , ramps and roads to facilitate moving the heads . The regional terrain offers significant obstacles such as swamps and floodplains ; avoiding these would have necessitated crossing undulating hill country . The construction of temporary causeways using the suitable and plentiful floodplain soils would have allowed a direct route across the floodplains to the San Lorenzo Plateau . Earth structures such as mounds , platforms and causeways upon the plateau demonstrate that the Olmec possessed the necessary knowledge and could commit the resources to build large @-@ scale earthworks . + The flat backs of many of the colossal heads represented the flat bases of the monumental thrones from which they were reworked . Only four of the seventeen heartland heads do not have flattened backs , indicating the possibility that the majority were reworked monuments . Alternatively , the backs of many of these massive monuments may have been flattened to ease their transport , providing a stable form for hauling the monuments with ropes . Two heads from San Lorenzo have traces of niches that are characteristic of monumental Olmec thrones and so were definitely reworked from earlier monuments . + + = = Known monuments = = + + Seventeen confirmed examples are known . An additional monument , at Takalik Abaj in Guatemala , is a throne that may have been carved from a colossal head . This is the only known example outside of the Olmec heartland on the Gulf Coast of Mexico . Possible fragments of additional colossal heads have been recovered at San Lorenzo and at San Fernando in Tabasco . Crude colossal stone heads are also known in the Southern Maya area where they are associated with the potbelly style of sculpture . 
Although some arguments have been made that they are pre @-@ Olmec , these latter monuments are generally believed to be influenced by the Olmec style of sculpture . + + = = = San Lorenzo = = = + + The ten colossal heads from San Lorenzo originally formed two roughly parallel lines running north @-@ south across the site . Although some were recovered from ravines , they were found close to their original placements and had been buried by local erosion . These heads , together with a number of monumental stone thrones , probably formed a processional route across the site , powerfully displaying its dynastic history . Two of the San Lorenzo heads had been re @-@ carved from older thrones . + San Lorenzo Colossal Head 1 ( also known as San Lorenzo Monument 1 ) was lying facing upwards when excavated . The erosion of a path passing on top of the monument uncovered its eye and led to the discovery of the Olmec site . Colossal Head 1 is 2 @.@ 84 metres ( 9 @.@ 3 ft ) high ; it measures 2 @.@ 11 metres ( 6 @.@ 9 ft ) wide and it weighs 25 @.@ 3 tons . The monument was discovered partially buried at the edge of a gully by Matthew Stirling in 1945 . When discovered it was lying on its back , looking upwards . It was associated with a large number of broken ceramic vessels and figurines . The majority of these ceramic remains have been dated to between 800 and 400 BC ; some pieces have been dated to the Villa Alta phase ( Late Classic period , 800 – 1000 AD ) . The headdress possesses a plain band that is tied at the back of the head . The upper portion of the headdress is decorated with a U @-@ shaped motif . This element descends across the front of the headdress , terminating on the forehead . On the front portion it is decorated with five semicircular motifs . The scalp piece does not meet the horizontal band , leaving a space between the two pieces . On each side of the face a strap descends from the headdress and passes in front of the ear . 
The forehead is wrinkled in a frown . The lips are slightly parted without revealing the teeth . The cheeks are pronounced and the ears are particularly well executed . The face is slightly asymmetric , which may be due to error on the part of the sculptors or may accurately reflect the physical features of the portrait 's subject . The head has been moved to the Museo de Antropología de Xalapa ( " Anthropological Museum of Xalapa " ) . + San Lorenzo Colossal Head 2 ( also known as San Lorenzo Monument 2 ) was reworked from a monumental throne . The head stands 2 @.@ 69 metres ( 8 @.@ 8 ft ) high and measures 1 @.@ 83 metres ( 6 @.@ 0 ft ) wide by 1 @.@ 05 metres ( 3 @.@ 4 ft ) deep ; it weighs 20 tons . Colossal Head 2 was discovered in 1945 when Matthew Stirling 's guide cleared away some of the vegetation and mud that covered it . The monument was found lying on its back , facing the sky , and was excavated in 1946 by Stirling and Philip Drucker . In 1962 the monument was removed from the San Lorenzo plateau in order to put it on display as part of " The Olmec tradition " exhibition at the Museum of Fine Arts in Houston in 1963 . San Lorenzo Colossal Head 2 is currently in the Museo Nacional de Antropología in Mexico City . The head was associated with a number of ceramic finds ; they have been dated to the Early Preclassic and Late Classic periods . Colossal Head 2 wears a complex headdress that sports a horizontal band tied at the back of the head ; this is decorated with three bird 's heads that are located above the forehead and temples . The scalp piece is formed from six strips running towards the back of the head . The front of the headdress above the horizontal band is plain . Two short straps hang down from the headdress in front of the ears . The ear jewellery is formed by large squared hoops or framed discs . The left and right ornaments are different , with radial lines on the left earflare , a feature absent on the right earflare . 
The head is badly damaged due to an unfinished reworking process . This process has pitmarked the entire face with at least 60 smaller hollows and 2 larger holes . The surviving features appear to depict an ageing man with the forehead creased into a frown . The lips are thick and slightly parted to reveal the teeth ; the head has a pronounced chin . + San Lorenzo Colossal Head 3 is also known as San Lorenzo Monument 3 . The head measures 1 @.@ 78 metres ( 5 @.@ 8 ft ) high by 1 @.@ 63 metres ( 5 @.@ 3 ft ) wide by 0 @.@ 95 metres ( 3 @.@ 1 ft ) deep and weighs 9 @.@ 4 tons . The head was discovered in a deep gully by Matthew Stirling in 1946 ; it was found lying face down and its excavation was difficult due to the wet conditions in the gully . The monument was found 0 @.@ 8 kilometres ( 0 @.@ 50 mi ) southwest of the main mound at San Lorenzo , however , its original location is unknown ; erosion of the gully may have resulted in significant movement of the sculpture . Head 3 has been moved to the Museo de Antropología de Xalapa . The headdress is complex , with the horizontal basal band being formed by four horizontal cords , with diagonal folds above each eye . A small skullcap tops the headdress . A large flap formed of four cords drops down both sides of the head , completely covering the ears . The face has a typically frowning brow and , unusually , has clearly defined eyelids . The lips are thick and slightly parted ; the front of the lower lip has broken away completely , and the lower front of the headdress is pitted with 27 irregularly spaced artificial depressions . + San Lorenzo Colossal Head 4 ( also known as San Lorenzo Monument 4 ) weighs 6 tons and has been moved to the Museo de Antropología de Xalapa . Colossal Head 4 is 1 @.@ 78 metres ( 5 @.@ 8 ft ) high , 1 @.@ 17 metres ( 3 @.@ 8 ft ) wide and 0 @.@ 95 metres ( 3 @.@ 1 ft ) deep . 
The head was discovered by Matthew Stirling in 1946 , 550 metres ( 600 yd ) northwest of the principal mound , at the edge of a gully . When excavated , it was found to be lying on its right @-@ hand side and in a very good state of preservation . Ceramic materials excavated with the head became mixed with ceramics associated with Head 5 , making ceramic dating of the monument difficult . The headdress is decorated with a horizontal band formed of four sculpted cords , similar to those of Head 3 . On the right @-@ hand side , three tassels descend from the upper portion of the headdress ; they terminate in a total of eight strips that hang down across the horizontal band . These tassels are judged to represent hair rather than cords . Also on the right hand side , two cords descend across the ear and continue to the base of the monument . On the left @-@ hand side , three vertical cords descend across the ear . The earflare is only visible on the right hand side ; it is formed of a plain disc and peg . The face is that of an ageing man with a creased forehead , low cheekbones and a prominent chin . The lips are thick and slightly parted . + San Lorenzo Colossal Head 5 is also known as San Lorenzo Monument 5 . The monument stands 1 @.@ 86 metres ( 6 @.@ 1 ft ) high and measures 1 @.@ 47 metres ( 4 @.@ 8 ft ) wide by 1 @.@ 15 metres ( 3 @.@ 8 ft ) deep . It weighs 11 @.@ 6 tons . The head was discovered by Matthew Stirling in 1946 , face down in a gully to the south of the principal mound . The head is particularly well executed and is likely to have been found close to its original location . Ceramics recovered during its excavation became mixed with those from the excavation of Head 4 . The mixed ceramics have been dated to the San Lorenzo and Villa Alta phases ( approximately 1400 – 1000 BC and 800 – 1000 AD respectively ) . 
Colossal Head 5 is particularly well preserved , although the back of the headdress band was damaged when the head was moved from the archaeological site . The band of the headdress is set at an angle and has a notch above the bridge of the nose . The headdress is decorated with jaguar paws ; this general identification of the decoration is contested by Beatriz de la Fuente since the " paws " have three claws each ; she identifies them as the claws of a bird of prey . At the back of the head , ten interlaced strips form a net decorated with disc motifs . Two short straps descend from the headdress in front of the ears . The ears are adorned with disc @-@ shaped earspools with pegs . The face is that of an ageing man with wrinkles under the eyes and across the bridge of the nose , and a forehead that is creased in a frown . The lips are slightly parted . Colossal Head 5 has been moved to the Museo de Antropología de Xalapa . + San Lorenzo Colossal Head 6 ( also known as San Lorenzo Monument 17 ) is one of the smaller examples of colossal heads , standing 1 @.@ 67 metres ( 5 @.@ 5 ft ) . It measures 1 @.@ 41 metres ( 4 @.@ 6 ft ) wide by 1 @.@ 26 metres ( 4 @.@ 1 ft ) deep and is estimated to weigh between 8 and 10 tons . The head was discovered by a local farmworker and was excavated in 1965 by Luis Aveleyra and Román Piña Chan . The head had collapsed into a ravine under its own weight and was found face down on its left hand side . In 1970 it was transported to the Metropolitan Museum of Art in New York for the museum 's centenary exhibition . After its return to Mexico , it was placed in the Museo Nacional de Antropología in Mexico City . It is sculpted with a net @-@ like head covering joined together with sculpted beads . A covering descends from under the headdress to cover the back half of the neck . The headband is divided into four strips and begins above the right ear , extending around the entire head . 
A short strap descends from either side of the head to the ear . The ear ornaments are complex and are larger at the front of the ear than at the back . The face is that of an ageing male with the forehead creased in a frown , wrinkles under the eyes , sagging cheeks and deep creases on either side of the nose . The face is somewhat asymmetric , possibly due to errors in the execution of the monument . + San Lorenzo Colossal Head 7 ( also known as San Lorenzo Monument 53 ) measures 2 @.@ 7 metres ( 8 @.@ 9 ft ) high by 1 @.@ 85 metres ( 6 @.@ 1 ft ) wide by 1 @.@ 35 metres ( 4 @.@ 4 ft ) deep and weighs 18 tons . San Lorenzo Colossal Head 7 was reworked from a monumental throne ; it was discovered by a joint archaeological project by the Instituto Nacional de Antropología e Historia and Yale University , as a result of a magnetometer survey . It was buried at a depth of less than 1 metre ( 3 @.@ 3 ft ) and was lying facing upwards , leaning slightly northwards on its right hand side . The head is poorly preserved and has suffered both from erosion and deliberate damage . The headdress is decorated with a pair of human hands ; a feathered ornament is carved at the back of the headband and two discs adorn the front . A short strap descends from the headband and hangs in front of the right ear . The head sports large earflares that completely cover the earlobes , although severe erosion makes their exact form difficult to distinguish . The face has wrinkles between the nose and cheeks , sagging cheeks and deep @-@ set eyes ; the lips are badly damaged and the mouth is open , displaying the teeth . In 1986 the head was transported to the Museo de Antropología de Xalapa . + San Lorenzo Colossal Head 8 ( also known as San Lorenzo Monument 61 ) stands 2 @.@ 2 metres ( 7 @.@ 2 ft ) high ; it measures 1 @.@ 65 metres ( 5 @.@ 4 ft ) wide by 1 @.@ 6 metres ( 5 @.@ 2 ft ) deep and weighs 13 tons . It is one of the finest examples of an Olmec colossal head . 
It was found lying on its side to the south of a monumental throne . The monument was discovered at a depth of 5 metres ( 16 ft ) during a magnetometer survey of the site in 1968 ; it has been dated to the Early Preclassic . After discovery it was initially reburied ; it was moved to the Museo de Antropología de Xalapa in 1986 . The headdress is decorated with the talons or claws of either a jaguar or an eagle . It has a headband and a cover that descends from under the headdress proper behind the ears . Two short straps descend in front of the ears . The head sports large ear ornaments in the form of pegs . The face is that of a mature male with sagging cheeks and wrinkles between these and the nose . The forehead is gathered in a frown . The mouth is slightly parted to reveal the teeth . Most of the head is carved in a realistic manner , the exception being the ears . These are stylised and represented by one question mark shape contained within another . The head is very well preserved and displays a fine finish . + San Lorenzo Colossal Head 9 is also known as San Lorenzo Monument 66 . It measures 1 @.@ 65 metres ( 5 @.@ 4 ft ) high by 1 @.@ 36 metres ( 4 @.@ 5 ft ) wide by 1 @.@ 17 metres ( 3 @.@ 8 ft ) deep . The head was exposed in 1982 by erosion of the gullies at San Lorenzo ; it was found leaning slightly on its right hand side and facing upwards , half covered by the collapsed side of a gully and washed by a stream . Although it was documented by archaeologists , it remained for some time in its place of discovery before being moved to the Museo de Antropología de Xalapa . The headdress is of a single piece without a distinct headband . The sides display features that are possibly intended to represent long hair trailing to the bottom of the monument . The earflares are rectangular plates with an additional trapezoid element at the front . The head is also depicted wearing a nose @-@ ring . 
The face is smiling and has wrinkles under the eyes and at the edge of the mouth . It has sagging cheeks and wide eyes . The mouth is closed and the upper lip is badly damaged . The sculpture suffered some mutilation in antiquity , with nine pits hollowed into the face and headdress . + San Lorenzo Colossal Head 10 ( also known as San Lorenzo Monument 89 ) has been moved to the Museo Comunitario de San Lorenzo Tenochtitlán near Texistepec . It stands 1 @.@ 8 metres ( 5 @.@ 9 ft ) tall and measures 1 @.@ 43 metres ( 4 @.@ 7 ft ) wide by 0 @.@ 92 metres ( 3 @.@ 0 ft ) deep ; it weighs 8 tons . The head was discovered by a magnetometer survey in 1994 ; it was found buried , lying face upwards in the bottom of a ravine and was excavated by Ann Cyphers . The headdress is formed of 92 circular beads that completely cover the upper part of the head and descend across the sides and back . Above the forehead is a large element forming a three @-@ toed foot with long nails , possibly the foot of a bird . The head wears large earspools that protrude beyond the beads of the headdress . The spools have the form of a rounded square with a circular sunken central portion . The face is that of a mature man with the mouth closed , sagging cheeks and lines under the eyes . The mouth is sensitively carved and the head possesses a pronounced chin . + + = = = La Venta = = = + + Three of the La Venta heads were found in a line running east @-@ west in the northern Complex I ; all three faced northwards , away from the city centre . The other head was found in Complex B to the south of the Great Pyramid , in a plaza that included a number of other sculptures . The latter , the first of the La Venta heads to be discovered , was found during archaeological exploration of La Venta in 1925 ; the other three remained unknown to archaeologists until a local boy guided Matthew Stirling to them while he was excavating the first head in 1940 . 
They were located approximately 0 @.@ 9 kilometres ( 0 @.@ 56 mi ) to the north of Monument 1 . + La Venta Monument 1 is speculated to have been the portrait of La Venta 's final ruler . Monument 1 measures 2 @.@ 41 metres ( 7 @.@ 9 ft ) high by 2 @.@ 08 metres ( 6 @.@ 8 ft ) wide by 1 @.@ 95 metres ( 6 @.@ 4 ft ) deep ; it weighs 24 tons . The front of the headdress is decorated with three motifs that apparently represent the claws or fangs of an animal . Above these symbols is an angular U @-@ shaped decoration descending from the scalp . On each side of the monument a strap descends from the headdress , passing in front of the ear . Each ear has a prominent ear ornament that descends from the earlobe to the base of the monument . The features are those of a mature man , with wrinkles around the mouth , eyes and nose . Monument 1 is the best preserved head at La Venta but has suffered from erosion , particularly at the back . The head was first described by Franz Blom and Oliver La Farge who investigated the La Venta remains on behalf of Tulane University in 1925 . When discovered it was half @-@ buried ; its massive size meant that the discoverers were unable to excavate it completely . Matthew Stirling fully excavated the monument in 1940 , after clearing the thick vegetation that had covered it in the intervening years . Monument 1 has been moved to the Parque @-@ Museo La Venta in Villahermosa . The head was found in its original context ; associated finds have been radiocarbon dated to between 1000 and 600 BC . + La Venta Monument 2 measures 1 @.@ 63 metres ( 5 @.@ 3 ft ) high by 1 @.@ 35 metres ( 4 @.@ 4 ft ) wide by 0 @.@ 98 metres ( 3 @.@ 2 ft ) deep ; the head weighs 11 @.@ 8 tons . The face has a broadly smiling expression that reveals four of the upper teeth . The cheeks are given prominence by the action of smiling ; the brow that is normally visible in other heads is covered by the rim of the headdress . 
The face is badly eroded , distorting the features . In addition to the severe erosion damage , the upper lip and a part of the nose have been deliberately mutilated . The head was found in its original context a few metres north of the northwest corner of pyramid @-@ platform A @-@ 2 . Radiocarbon dating of the monument 's context dates it to between 1000 and 600 BC . Monument 2 has suffered erosion damage from its exposure to the elements prior to discovery . The head has a prominent headdress but this is badly eroded and any individual detail has been erased . A strap descends in front of the ear on each side of the head , descending as far as the earlobe . The head is adorned with ear ornaments in the form of a disc that covers the earlobe , with an associated clip or peg . The surviving details of the headdress and earflares are stylistically similar to those of Tres Zapotes Monument A. The head has been moved to the Museo del Estado de Tabasco in Villahermosa . + La Venta Monument 3 stands 1 @.@ 98 metres ( 6 @.@ 5 ft ) high and measures 1 @.@ 6 metres ( 5 @.@ 2 ft ) wide by 1 metre ( 3 @.@ 3 ft ) deep ; it weighs 12 @.@ 8 tons . Monument 3 was located a few metres to the east of Monument 2 , but was moved to the Parque @-@ Museo La Venta in Villahermosa . Like the other La Venta heads , its context has been radiocarbon dated to between 1000 and 600 BC . It appears unfinished and has suffered severe damage through weathering , making analysis difficult . It had a large headdress that reaches to the eyebrows but any details have been lost through erosion . Straps descend in front of each ear and continue to the base of the monument . The ears are wearing large flattened rings that overlap the straps ; they probably represent jade ornaments of a type that have been recovered in the Olmec region . 
Although most of the facial detail is lost , the crinkling of the bridge of the nose is still evident , a feature that is common to the frowning expressions of the other Olmec colossal heads . + La Venta Monument 4 measures 2 @.@ 26 metres ( 7 @.@ 4 ft ) high by 1 @.@ 98 metres ( 6 @.@ 5 ft ) wide and 1 @.@ 86 metres ( 6 @.@ 1 ft ) deep . It weighs 19 @.@ 8 tons . It was found a few metres to the west of Monument 2 and has been moved to the Parque @-@ Museo La Venta . As with the other heads in the group , its archaeological context has been radiocarbon dated to between 1000 and 600 BC . The headdress is elaborate and , although damaged , various details are still discernible . The base of the headdress is formed by three horizontal strips running over the forehead . One side is decorated with a double @-@ disc motif that may have been repeated on the other ; if so , damage to the right side has obliterated any trace of it . The top of the headdress is decorated with the clawed foot of a bird of prey . Either straps or plaits of hair descend on either side of the face , from the headdress to the base of the monument . Only one earspool survives ; it is flat , in the form of a rounded square , and is decorated with a cross motif . The ears have been completely eroded away and the lips are damaged . The surviving features display a frown and creasing around the nose and cheeks . The head displays prominent teeth . + + = = = Tres Zapotes = = = + + The two heads at Tres Zapotes , with the La Cobata head , are stylistically distinct from the other known examples . Beatriz de la Fuente views them as a late regional survival of an older tradition while other scholars argue that they are merely the kind of regional variant to be expected in a frontier settlement . These heads are sculpted with relatively simple headdresses ; they have squat , wide proportions and distinctive facial features . The two Tres Zapotes heads are the earliest known stone monuments from the site . 
The discovery of one of the Tres Zapotes heads in the nineteenth century led to the first archaeological investigations of Olmec culture , carried out by Matthew Stirling in 1938 . + Tres Zapotes Monument A ( also known as Tres Zapotes Colossal Head 1 ) was the first colossal head to be found , discovered by accident in the middle of the nineteenth century , 1 kilometre ( 0 @.@ 62 mi ) to the north of the modern village of Tres Zapotes . After its discovery it remained half @-@ buried until it was excavated by Matthew Stirling in 1939 . At some point it was moved to the plaza of the modern village , probably in the early 1960s . It has since been moved to the Museo Comunitario de Tres Zapotes . Monument A stands 1 @.@ 47 metres ( 4 @.@ 8 ft ) tall ; it measures 1 @.@ 5 metres ( 4 @.@ 9 ft ) wide by 1 @.@ 45 metres ( 4 @.@ 8 ft ) deep , and is estimated to weigh 7 @.@ 8 tons . The head is sculpted with a simple headdress with a wide band that is otherwise unadorned , and wears rectangular ear ornaments that project forwards onto the cheeks . The face is carved with deep creases between the cheeks and the nose and around the mouth ; the forehead is creased into a frown . The upper lip has suffered recent damage , with the left portion flaking away . + Tres Zapotes Monument Q ( also known as the Nestape Head and Tres Zapotes Colossal Head 2 ) measures 1 @.@ 45 metres ( 4 @.@ 8 ft ) high by 1 @.@ 34 metres ( 4 @.@ 4 ft ) wide by 1 @.@ 26 metres ( 4 @.@ 1 ft ) deep and weighs 8 @.@ 5 tons . Its exact date of discovery is unknown but is estimated to have been some time in the 1940s , when it was struck by machinery being used to clear vegetation from Nestape hill . Monument Q was the eleventh colossal head to be discovered . It was moved to the plaza of Santiago Tuxtla in 1951 and remains there to this day . Monument Q was first described by Williams and Heizer in an article published in 1965 . 
The headdress is decorated with a frontal tongue @-@ shaped ornament , and the back of the head is sculpted with seven plaits of hair bound with tassels . A strap descends from each side of the headdress , passing over the ears and to the base of the monument . The face has pronounced creases around the nose , mouth and eyes . + + = = = La Cobata = = = + + The La Cobata region was the source of the basalt used for carving all of the colossal heads in the Olmec heartland . The La Cobata colossal head was discovered in 1970 and was the fifteenth to be recorded . It was discovered in a mountain pass in the Sierra de los Tuxtlas , on the north side of El Vigia volcano near to Santiago Tuxtla . The head was largely buried when found ; excavations uncovered a Late Classic ( 600 – 900 AD ) offering associated with the head consisting of a ceramic vessel and a 12 @-@ centimetre ( 4 @.@ 7 in ) long obsidian knife placed pointing northwards towards the head . The offering is believed to have been deposited long after the head was sculpted . The La Cobata head has been moved from its original location to the main plaza at Santiago . + The La Cobata head is more or less rounded and measures 3 by 3 metres ( 9 @.@ 8 by 9 @.@ 8 ft ) by 3 @.@ 4 metres ( 11 ft ) high , making it the largest known head . This massive sculpture is estimated to weigh 40 tons . It is stylistically distinct from the other examples , and Beatriz de la Fuente placed it late in the Olmec time frame . The characteristics of the sculpture have led to some investigators suggesting that it represents a deceased person . Norman Hammond argues that the apparent stylistic differences of the monument stem from its unfinished state rather than its late production . The eyes of the monument are closed , the nose is flattened and lacks nostrils and the mouth was not sculpted in a realistic manner . The headdress is in the form of a plain horizontal band . 
+ The original location of the La Cobata head was not a major archaeological site and it is likely that the head was either abandoned at its source or during transport to its intended destination . Various features of the head suggest that it was unfinished , such as a lack of symmetry below the mouth and an area of rough stone above the base . Rock was not removed from around the earspools as on other heads , and does not narrow towards the base . Large parts of the monument seem to be roughed out without finished detail . The right hand earspool also appears incomplete ; the forward portion is marked with a sculpted line while the rear portion has been sculpted in relief , probably indicating that the right cheek and eye area were also unfinished . The La Cobata head was almost certainly carved from a raw boulder rather than being sculpted from a throne . + + = = = Takalik Abaj = = = + + Takalik Abaj Monument 23 dates to the Middle Preclassic period , and is found in Takalik Abaj , an important city in the foothills of the Guatemalan Pacific coast , in the modern department of Retalhuleu . It appears to be an Olmec @-@ style colossal head re @-@ carved into a niche figure sculpture . If originally a colossal head then it would be the only known example from outside the Olmec heartland . + Monument 23 is sculpted from andesite and falls in the middle of the size range for confirmed colossal heads . It stands 1 @.@ 84 metres ( 6 @.@ 0 ft ) high and measures 1 @.@ 2 metres ( 3 @.@ 9 ft ) wide by 1 @.@ 56 metres ( 5 @.@ 1 ft ) deep . Like the examples from the Olmec heartland , the monument features a flat back . Lee Parsons contests John Graham 's identification of Monument 23 as a re @-@ carved colossal head ; he views the side ornaments , identified by Graham as ears , as rather the scrolled eyes of an open @-@ jawed monster gazing upwards . 
Countering this , James Porter has claimed that the re @-@ carving of the face of a colossal head into a niche figure is clearly evident . + Monument 23 was damaged in the mid @-@ twentieth century by a local mason who attempted to break its exposed upper portion using a steel chisel . As a result , the top is fragmented , although the broken pieces were recovered by archaeologists and have been put back into place . + + = = Collections = = + + All of the 17 confirmed colossal heads remain in Mexico . Two heads from San Lorenzo are on permanent display at the Museo Nacional de Antropología in Mexico City . Seven of the San Lorenzo heads are on display in the Museo de Antropología de Xalapa . Five of them are in Sala 1 , one is in Sala 2 and one is in Patio 1 . The remaining San Lorenzo head is in the Museo Comunitario de San Lorenzo Tenochtitlán near Texistepec . All four heads from La Venta are now in Villahermosa , the state capital of Tabasco . Three are in the Parque @-@ Museo La Venta and one in the Museo del Estado de Tabasco . Two heads are on display in the plaza of Santiago Tuxtla ; one from Tres Zapotes and the La Cobata Head . The other Tres Zapotes head is in the Museo Comunitario de Tres Zapotes . + Several colossal heads have been loaned to temporary exhibitions abroad ; San Lorenzo Colossal Head 6 was loaned to the Metropolitan Museum of Art in New York in 1970 . San Lorenzo colossal heads 4 and 8 were lent to the Olmec Art of Ancient Mexico exhibition in the National Gallery of Art , Washington , D.C. that ran from 30 June to 20 October 1996 . San Lorenzo Head 4 was again loaned in 2005 , this time to the de Young Museum in San Francisco . The de Young Museum was loaned San Lorenzo colossal heads 5 and 9 for its Olmec : Colossal Masterworks of Ancient Mexico exhibition , which ran from 19 February to 8 May 2011 . 
+ + = = = Vandalism = = = + + On 12 January 2009 , at least three people , including two Mexicans and one American , entered the Parque @-@ Museo La Venta in Villahermosa and damaged just under 30 archaeological pieces , including the four La Venta colossal heads . The vandals were all members of an evangelical church and appeared to have been carrying out a supposed pre @-@ Columbian ritual , during which salts , grape juice and oil were thrown on the heads . It was estimated that 300 @,@ 000 pesos ( US $ 21 @,@ 900 ) would be needed to repair the damage , and the restoration process would last four months . The three vandals were released soon after their arrest after paying 330 @,@ 000 pesos each . + + = = = Replicas = = = + + Although not all of the replicas were placed by him , the majority of replicas around the world were placed under the leadership of Miguel Alemán Velasco , former governor of the state of Veracruz . The following is a list of replicas and their locations within the United States : + Austin , Texas . A replica of San Lorenzo Head 1 was placed in the Teresa Lozano Long Institute of Latin American Studies at the University of Texas in November 2008 . + Chicago , Illinois . A replica of San Lorenzo Head 8 made by Ignacio Perez Solano was placed in the Field Museum of Natural History in 2000 . + Covina , California . A replica of San Lorenzo Head 5 was donated to Covina in 1989 , originally intended to be placed in Jalapa Park . Due to concerns over potential vandalism it was instead installed outside the police station . It was removed in 2011 and relocated to Jobe 's Glen , Jalapa Park in June 2012 . + McAllen , Texas . A replica of San Lorenzo Head 8 is located in the International Museum of Art & Science . The specific date of placement is unknown , but it was dedicated by Fidel Herrera Beltrán , then governor of Veracruz , during his time in office between 2004 and 2010 . + New York .
A replica of San Lorenzo Head 1 was placed next to the main plaza in the grounds of Lehman College in the Bronx , New York . It was installed in 2013 to celebrate the first anniversary of the CUNY Institute of Mexican Studies , housed at the college . The replica was a gift by the government of Veracruz state , Cumbre Tajín and Mexico Trade ; it was first placed in Dag Hammarskjöld Park , outside the United Nations , in 2012 . + San Francisco , California . A replica of San Lorenzo Head 1 created by Ignacio Perez Solano was placed in San Francisco City College , Ocean Campus in October 2004 . + Washington , D.C. A replica of San Lorenzo Head 4 sculpted by Ignacio Perez Solano was placed near the Constitution Avenue entrance of the Smithsonian National Museum of Natural History in October 2001 . + West Valley City , Utah . A replica of San Lorenzo Head 8 was placed in the Utah Cultural Celebration Center in May 2004 . + Mexico donated a resin replica of an Olmec colossal head to Belgium ; it is on display in the Musée du Cinquantenaire in Brussels . + In February 2010 , the Mexican Secretaría de Relaciones Exteriores ( Secretariat of Foreign Affairs ) announced that the Instituto Nacional de Antropología e Historia would be donating a replica Olmec colossal head to Ethiopia , to be placed in Plaza Mexico in Addis Ababa . + + + = Brad Stevens = + + Bradley Kent " Brad " Stevens ( born October 22 , 1976 ) is an American professional basketball head coach for the Boston Celtics of the NBA . He was previously the head coach at Butler University in Indianapolis . A former basketball player , he grew up in Zionsville , Indiana , where he starred on the Zionsville Community High School basketball team , setting four school records . After high school , he attended DePauw University , where he played basketball and earned a degree in economics . He made the all @-@ conference team multiple times and was a three @-@ time Academic All @-@ America nominee .
+ Stevens joined the Butler basketball program as a volunteer prior to the 2000 – 01 season after quitting his job at Eli Lilly and Company . He was promoted to a full @-@ time assistant coaching position for the 2001 – 02 season . On April 4 , 2007 , he became the head coach after Todd Lickliter left to coach the Iowa Hawkeyes . In his first year , Stevens led Butler to 30 wins , becoming the third @-@ youngest head coach in NCAA Division I history to have a 30 @-@ win season . + In 2010 , his third year as head coach , Stevens broke the NCAA record for most wins in a coach 's first three years , exceeding the previous record by eight . In the postseason , Stevens coached Butler to the first Final Four in school history . At 33 years old , Stevens became the second @-@ youngest head coach to make a NCAA National Championship game , losing 61 – 59 to Duke . Shortly after the season ended , he signed a contract extension with Butler through the 2011 – 12 season . With the 2010 – 11 team making the Final Four , Stevens became the youngest coach to go to two Final Fours . Stevens coached the Bulldogs in their second consecutive national championship game on April 4 , 2011 , where the team lost to the Huskies of the University of Connecticut . + Stevens is known for a calm , focused coaching style . He spends a lot of time analyzing opponents using statistical analysis , adding new wrinkles to his team 's play each game . He puts a strong emphasis on defensive and team oriented basketball . Butler 's success against teams with superior athletes has been attributed to Stevens ' coaching style and calm demeanor . Stevens has twice been named the Horizon League Coach of the Year and won collegeinsider.com 's Hugh Durham Award mid @-@ season honors in January 2009 . He has also been both a Hugh Durham Award and Jim Phelan Award finalist all three years of his career . Stevens has been called a coaching prodigy and compared to John Wooden . 
He is married with two young children . In July 2013 , he signed a six @-@ year , 22 million dollar contract to become the head coach of the Boston Celtics in the NBA . In April 2015 , Stevens led the Celtics to the NBA Playoffs as the 7th seed in the Eastern Conference with a 40 – 42 record . + + = = Early life = = + + Bradley Kent Stevens grew up in Zionsville , Indiana , where he developed his love for basketball . Starting at age five , Stevens would watch taped basketball games " before he went to afternoon kindergarten " . His father would often drive him to Bloomington , to watch Indiana Hoosiers games . " It 's hard not to be [ in love with basketball ] when you 're a kid growing up in Indiana " , Stevens later said . + For his eighth birthday , Stevens received a new basketball hoop . " It ’ s so much fun to dream in your driveway , " he later remarked . " That ’ s where my friends and I hung out . It was a lot of fun to grow up in that era . " When a friend , Brandon Monk , had a basketball court installed in his back yard , Stevens " appeared instantaneously . " He was so dedicated to the game that he would bring the unprepared ingredients for grilled cheese sandwiches over to Monk 's house , so that he would not waste time waiting for the sandwiches to cook . + Monk 's court soon became a gathering place , where Zionsville kids and kids from the surrounding areas would hold pickup games . These games helped develop Stevens ' competitive streak . Besides playing basketball , the young Stevens also enjoyed solving puzzles , a skill he later applied to analyzing opposing teams to find their weaknesses . + Stevens attended Zionsville Community High School , where he became a star basketball player . He wore No. 31 in high school in honor of Indiana Pacers guard Reggie Miller . During his freshman year , he would get up early to practice shooting at a local gym before school . The hard work paid off as Stevens made the varsity team that same year . 
By the time his high school career was complete , Stevens had set school records for career scoring , assists , steals , and three @-@ point field goals . As of 2010 , he still holds the records for points ( 1508 ) , assists ( 444 ) , and steals ( 156 ) , as well as the single @-@ season points record ( 644 in 1995 ) . Stevens was named to the all @-@ conference team three times . In 1995 , he was named the sectional MVP and was the leading scorer in state sectional play ( 32 @.@ 3 ppg ) . + Stevens made the academic all @-@ state first team and received the Straight A Gold Medal Award all four years . He was a member of the National Honor Society , graduating seventh in his class of 165 . He earned three letters in basketball , three in track , and one in baseball during his days at Zionsville . During summers , he traveled the country playing AAU basketball . + Although Stevens had a strong passion for the game , he realized that his basketball skills were modest and not likely to get him very far . As such , he chose to attend academically oriented DePauw University for college . During his stay , he played in all 101 DePauw games , earning four varsity letters . He earned multiple all @-@ conference and academic all @-@ conference awards , and was a three @-@ time Academic All @-@ America nominee . He was a team captain his senior year , and averaged more than 8 points per game three of his four years . His career highs were 24 points and 8 rebounds in a game . After his senior year , Stevens received the Coaches ’ Award . Coach Bill Fenlon later described Stevens as " one of the most selfless , team @-@ oriented person [ sic ] I 've ever been around . " + While at DePauw , Stevens was a member of the Management Fellows Honors Program and the DePauw Community Services ’ Sports Night executive board . He was also a brother of the Alpha Tau Omega fraternity . During summer vacations , Stevens spent time teaching at Butler basketball camps . 
He was named to the Dean 's list and graduated in 1999 with a degree in economics . + + = = College career = = + + In the summer of 2000 , Stevens was offered the opportunity to volunteer in the Butler basketball office . He ran the idea of quitting his job at Eli Lilly by then @-@ longtime girlfriend Tracy Wilhelmy . She thought about it for two hours before telling him to go for it . " Now , it looks like a great idea , " Stevens later remarked . " At the time , I thought it was something I really wanted to try . " Tracy went back to school to get a law degree that could support the couple if things did not work out for Brad . " We were 23 and realized this was our chance , " Tracy later said . " Five years down the road , we were probably not going to be in a position to do that . The more success you had at Lilly , the harder it would be to leave . " + Stevens planned to live in a friend 's basement and took a job at Applebee ’ s to pay the bills . Before he started training at Applebee 's , he was offered a low @-@ paying administrative position as coordinator of basketball operations under then @-@ coach Thad Matta . The position had opened up when assistant coach Jamal Meeks resigned after being arrested on solicitation and drug charges , of which he was later acquitted . Years later , Matta recalled , " [ Stevens ] was just a hungry young kid that was desperate to get into coaching . He had a great passion and was willing to take a risk to get into the coaching profession . " + After Matta left the school following the 2000 – 01 season , new head coach Todd Lickliter promoted Stevens to a full @-@ time assistant coach . Under Lickliter , Stevens was active in every aspect of the game : skills instruction , game preparation , in @-@ game coaching , and recruiting . Butler was 131 – 61 during Stevens ' time as an assistant coach . 
+ + = = = Named head coach = = = + + On April 2 , 2007 , Lickliter resigned in order to take the head @-@ coaching position at the University of Iowa . The Butler players had a meeting with athletic director Barry Collier , urging him to promote from within . Collier , having spent the entire season observing the assistant coaches ' interaction with the team , agreed . The day after Lickliter resigned Stevens and Butler 's two other assistant coaches interviewed for the job . Within 24 hours of the interviews Stevens was named Butler 's new head coach . According to Collier , Stevens had something older , outside candidates could never match : six years of experience learning the Butler system , dubbed " The Butler Way " by Collier . " Age wasn 't a factor because I 'd seen his ability shine through during the course of the season , " Collier said . + + = = = 2007 – 08 season = = = + + At the start of the 2007 – 08 season , Stevens was the second youngest coach in Division I basketball . He got off to a fast start , winning his first eight games before falling to Wright State 43 – 42 . Legendary coach Bob Knight , whose Texas Tech team was an early victim , said " I wish we played as smart as they do . " Virginia Tech coach Seth Greenberg added " they 've got toughness about them and they expect to win . " + Midway through Stevens ' first season , with the Bulldogs at 12 – 1 , The New York Times wrote " so far , Stevens has made the transition [ to head coach ] look easy . " The Times went on to state that Stevens had the calm and composure of a seasoned veteran . " You ’ ve got a lot of people always looking for the next step . And that ’ s not what I was doing . I was just trying to figure out a way to win the next game and think like a head coach . " Stevens said . + Butler ended the regular season with a 27 – 3 record , taking first place in the Horizon League with a 16 – 2 in conference mark . 
The team beat Illinois @-@ Chicago 66 – 50 and Cleveland State 70 – 55 to claim the league 's tournament title and an automatic bid to the 2008 NCAA tournament . Butler was awarded the seven seed in the East Regional . They beat tenth @-@ seeded South Alabama 81 – 61 in the first round , before falling to second @-@ seeded Tennessee 76 – 71 in overtime . + Stevens ended up with a school and Horizon league record 30 wins , beating several big name schools – Michigan , Texas Tech , Florida State , Ohio State – along the way . In so doing , he became the third @-@ youngest head coach in NCAA Division I history to lead a team to 30 wins in a season , and became the fourth @-@ winningest first @-@ year coach . Butler was nationally ranked for a school and league record 19 consecutive weeks . Butler 's 30 – 4 record was the best among teams that did not reach the Final Four . Stevens was a finalist for the Hugh Durham Award , losing to Keno Davis of Drake , and a finalist for the Jim Phelan National Coach of the Year Award , losing to Bo Ryan . + At the conclusion of the season , Butler signed Stevens to a seven @-@ year contract . " We are extremely excited to reach this long @-@ term agreement to have Brad continue to lead our program , " Collier remarked . + + = = = 2008 – 09 season = = = + + Butler lost four starters after the 2007 – 08 season , and was picked to finish fifth in the Horizon league during the 2008 – 09 season . The team got off to a 12 – 1 start that won Stevens the Hugh Durham mid @-@ season coaching award . On February 5 , Stevens notched his 50th win as Butler beat Detroit 66 – 61 . In so doing , Stevens became the sixth head coach in NCAA history to reach 50 wins in 56 games or fewer . Butler finished first in the Horizon League with a 15 – 3 in conference record , defying preseason expectations . Butler lost the Horizon League tournament final 57 – 54 to Cleveland State , but made the NCAA tournament as an at @-@ large selection . 
The team received the nine seed in the South Regional , and lost to eighth @-@ seeded Louisiana State in the first round by a score of 75 – 71 to finish the year at 26 – 6 overall . + Stevens ' 56 – 10 two @-@ year record places him second only to Bill Guthridge ( 58 ) in total wins during one 's first two years as head coach . Stevens was a finalist for both the Hugh Durham and Jim Phelan Awards for the second straight year and was named the Horizon League Coach of the Year . He was also named as a finalist for the Henry Iba Coach of the Year Award . Stevens was given a one @-@ year contract extension at the conclusion of the season . + + = = = 2009 – 10 season = = = + + Fueled in large part by Gordon Hayward 's and Shelvin Mack 's roles in leading Team USA to the gold medal in the FIBA Under @-@ 19 World Championship during the off @-@ season , Butler began the season ranked 10th in the Coaches ' Poll and 11th in the AP Poll . A few commentators picked the Bulldogs as a possible " sleeper team " to make the Final Four . Stevens was not so sure , privately telling his father , " We have a really good team , and I ’ m not sure how far we can go this year , but next year , we ought to go really far . " + Butler got off to a mediocre start , losing twice in the 76 Classic 82 – 73 to 22nd @-@ ranked Minnesota and to 19th @-@ ranked Clemson 70 – 69 . After the tournament Butler 's record stood at 4 – 2 and the team dropped to # 23 in the AP Poll and # 20 in the Coaches ' Poll . Butler won its next two games before falling to 13th @-@ ranked Georgetown 72 – 65 in the Jimmy V Classic . The team won its next two games beating # 15 Ohio State 74 – 66 and edging out former conference rival Xavier 69 – 68 , both at home . After losing 67 – 57 at UAB three days later , Butler stood at 9 – 4 and fell out of the AP rankings . However , the team remained in the Coaches Poll at # 23 . 
+ Stevens rallied the team , and they proceeded to win 16 straight games before facing Siena in a BracketBusters game . Butler beat Siena 70 – 53 and Stevens tied the NCAA record for most wins ( 81 ) by a head coach in his first three seasons set by Mark Few of Gonzaga in 2002 and tied by Mark Fox of Nevada in 2007 . + On February 26 , 2010 , Butler traveled to Valparaiso for their regular season finale . Leading scorer Gordon Hayward was sidelined with lower back pain , but the team still won 74 – 69 . In so doing , Stevens broke the coaching record he had tied the prior week and Butler completed an 18 – 0 undefeated conference schedule . It was Butler 's first undefeated conference record since joining the Horizon League , and first since Joe Sexson led the 1978 team to a 6 – 0 record in the now defunct Indiana Collegiate Conference . Stevens earned his third straight regular @-@ season conference championship . + In the Horizon league tournament , Stevens ' Bulldogs used their home @-@ court advantage to beat Milwaukee 68 – 59 in the semi @-@ finals and to beat Wright State 70 – 45 in the finals . The win earned the team an automatic bid into the 2010 NCAA tournament , and completed a 20 – 0 run through league play . Stevens became the first coach to lead a Horizon League team to both an undefeated regular season and conference tournament since the league was formed in 1979 . Stevens was also the only coach in Division I to lead his team to an undefeated conference schedule during the 2009 – 10 season . + + = = = = NCAA tournament = = = = + + For their season , the Bulldogs were ranked 8th in the final pre @-@ NCAA tournament Coaches ' Poll and 11th in the corresponding AP Poll . On Selection Sunday , the Bulldogs were seeded fifth in the West regional of the NCAA tournament and given a first @-@ round match up with twelfth seeded UTEP on March 18 . 
+ Many basketball commentators picked UTEP to pull the upset , and at halftime it looked like they might be right , as UTEP led 33 – 27 . Stevens made a number of halftime adjustments , and the Bulldogs came out firing on all cylinders in the second half . The team dominated the second half and won the game 77 – 59 . Butler next faced off with thirteenth seeded Murray State . The game was close throughout , but Butler emerged victorious 54 – 52 when Hayward deflected a Murray State pass into the back court with less than five seconds on the clock . The win gave Stevens the first Sweet Sixteen appearance of his career . + On March 25 , 2010 , Butler faced top @-@ seeded Syracuse . The Bulldogs got off to a good start , jumping out to a 12 – 1 lead and a 35 – 25 halftime advantage . Syracuse rallied in the second half , taking its first lead of the game , 40 – 39 , off a Wes Johnson three @-@ pointer . Stevens called timeout and Butler regained the lead on its next possession , stopping the run . At the 5 : 32 mark , Syracuse got a rare fast break opportunity that ended with a dunk and 54 – 50 lead . Stevens again called time out and re @-@ focused the team . Butler responded by holding Syracuse scoreless for the next 5 minutes , taking a 60 – 54 lead with 0 : 59 to go . Butler held on to win 63 – 59 , advancing to the Elite Eight for the first time in school history . + Two days later , Stevens ' Bulldogs met second @-@ seeded Kansas State in the regional finals . Perhaps feeling the effects of their double overtime 101 – 96 win two days prior , Kansas State got off to a slow start , scoring just 20 points in the first half to trail 27 – 20 . Butler kept the lead in the upper single digits for most of the second half , before Kansas State went on a 13 – 2 run and took a 52 – 51 lead . Stevens immediately called time out and re @-@ focused the team . " Play your game . Just play your game , " he told them . 
On the ensuing possession , Butler regained the lead for good . They outscored Kansas State 12 – 4 the rest of the way and won the game 63 – 56 . In the post game celebration , Stevens and walk @-@ on forward Emerson Kampen connected on a flying back @-@ bump that became one of the iconic images of the tournament . + The win earned the Bulldogs a trip back to Indianapolis for the first Final Four appearance in school and Horizon League history . The win made Stevens , at age 33 , the youngest coach to lead a team to the Final Four since Bob Knight made his first Final Four appearance at age 32 in 1973 . Butler became the smallest school ( enrollment 4 @,@ 200 ) to make the Final Four since seeding began in 1979 . + + = = = = = Final Four = = = = = + + On April 3 , Brad Stevens and the Butler Bulldogs faced off with Michigan State in the national semi @-@ finals . Michigan State took an early 14 – 7 lead , and Matt Howard got in early foul trouble , sitting most of the first half . Stevens kept the team focused with a " next man up " attitude and the game was tied at 28 at halftime . The second half was dominated by tight defense for both sides . With 2 : 45 to go in the game , the score was 47 – 44 Butler . Michigan State called a time out to set up a play . Stevens correctly anticipated the play call and had Ronald Nored , the team 's best defender , switch onto Korie Lucious off a screen . Nored stole the ball and Shawn Vanzant got fouled on the resulting run out , hitting 1 of 2 . Trailing 50 – 49 with under 30 seconds remaining , Michigan State came up empty and was forced to foul . Nored hit both foul shots , giving Butler a 52 – 49 lead . After a Michigan State time out , Stevens had his team foul Lucious with 2 seconds remaining to prevent a potentially game tying 3 @-@ pointer . After making the first , Lucious intentionally missed the second free throw . Hayward came down with the rebound to seal the victory .
Butler became the first team since the shot clock was adopted for the 1985 – 86 season to hold five straight tournament opponents under 60 points . + On April 5 , 2010 , Butler and Duke faced off in what The New York Times called " the most eagerly awaited championship game in years " . Late in the first half , Duke went on an 8 – 0 run to take a 26 – 20 lead . Stevens called a timeout . With starters Matt Howard and Ronald Nored on the bench in foul trouble , Stevens was forced to call on backup center Avery Jukes who came up big for Butler . Jukes scored 10 first half points , tying his season high . At half time , Duke 's lead stood at 33 – 32 . + The second half was played very closely , with neither team taking a substantial lead . With 3 : 16 to play , Duke took a 60 – 55 lead on two made free throws by Nolan Smith . Butler cut the lead to one point in the final minute and , after a missed Kyle Singler jump shot with 36 seconds remaining , got a chance to retake the lead . Butler was unable to initiate their offense and Stevens called a timeout to set up a play . A failed inbounds attempt and a timeout later , Hayward missed a baseline fade @-@ away jumper and Brian Zoubek came down with the rebound for Duke . He was quickly fouled with less than 4 seconds remaining . Hayward narrowly missed a desperation half @-@ court shot as time expired , making the final margin 61 – 59 . + The loss snapped Butler 's 25 @-@ game winning streak , the longest in school history . Butler became the smallest school to play for a National Championship since Jacksonville in 1970 . Stevens became the second @-@ youngest head coach to coach in the NCAA National Championship Game , behind Branch McCracken who led the Indiana Hoosiers to the 1940 National Championship at age 31 . Stevens was named as both a Hugh Durham and Jim Phelan Award finalist for the third consecutive year , losing to Mike Young and Jamie Dixon respectively .
He was also a finalist for the Skip Prosser Man of the Year Award , which was won by Bob Marlin . + Butler finished the year ranked # 2 in the Coaches ' Poll , the highest ranking in school history . The school was ranked for 19 consecutive weeks , tying the school record . + + = = = 2010 off @-@ season = = = + + After the end of the 2009 – 10 season , Brad Stevens and Butler continued to attract considerable attention . President Barack Obama personally called Stevens to congratulate him on Butler 's season . David Letterman had Stevens on his show for a guest appearance . Butler admissions inquiries shot up 67 % . Stevens received fan letters from around the world , and his phone rang off the hook . He was invited to throw the ceremonial first pitch before the Chicago Cubs vs. Florida Marlins game in Chicago on May 10 . " It 's all been very surreal , " Stevens said . " If you are the runner @-@ up , you don 't expect to talk to the president . " " It 's been a little overwhelming , because I 'm a pretty simple guy , " he added . + The 2009 – 10 season also helped increase Butler 's recruiting profile . Asked if the increased fame would change things , Stevens said it better not spoil him or the university . " I look at this new challenge of not changing and sticking to your core values and making sure you remain humble as a great coaching opportunity . " + + = = = 2010 – 11 season = = = + + Rankings by ESPN 's Andy Katz and Fox Sports ' Jeff Goodman released shortly after the 2010 Championship game both had Butler third for the 2010 – 2011 season . Duke coach Mike Krzyzewski agreed , saying Butler would be " right up there , No. 1 or No. 2 ... They 'll be a favorite next year . " However , Hayward chose to leave early for the NBA Draft and Butler went through a rough patch early in the season , at one point losing three straight games and having a 6 – 5 conference record . 
Bolstered by the emergence of Andrew Smith at center and Matt Howard 's success as a perimeter forward , Butler ended up winning a share of the conference title at 13 – 5 . The Bulldogs then won the Horizon League Tournament to secure an automatic NCAA tournament bid , and received an 8 seed . + Picked by many to lose a first @-@ round match @-@ up against Old Dominion , Butler advanced on a last @-@ second tip @-@ in by Howard . Howard was also clutch in their next game , hitting a free throw with less than one second remaining to beat Pitt in a dramatic finish . Shelvin Mack scored 30 points in the win . Butler won their next game when they defeated Wisconsin . On March 26 , 2011 , the Bulldogs beat Florida 74 – 71 in overtime to earn back @-@ to @-@ back trips to the Final Four . On April 2 , Butler beat fellow Cinderella team VCU 70 – 62 to make it to a second consecutive national championship game . For the second consecutive year , the Bulldogs fell in the national championship game , this time to Connecticut . + + = = = Coaching future = = = + + On April 8 , 2010 , Stevens signed a long @-@ term deal with Butler , extending his contract through the 2021 – 22 season . Financial terms of the contract were not disclosed ; however , Butler president Bobby Fong had publicly stated that the university could afford to increase Stevens ' base salary to approximately US $ 1 @,@ 000 @,@ 000 a few days prior . Stevens had previously made US $ 395 @,@ 000 plus benefits in base salary , a relatively low figure for a successful Division I head basketball coach . His total compensation for 2009 – 10 was estimated at US $ 750 @,@ 000 . Stevens had received a raise after each of his three seasons at Butler and his contract contains a buyout clause estimated in the high six or low seven figures . + By re @-@ signing with Butler , Stevens temporarily ended speculation that he would leave the university for a higher paying job .
Oregon , Clemson , and Wake Forest were all said to be interested in offering Stevens multi @-@ million dollar contracts to leave Butler . " First and foremost , I ’ m loyal to Butler , " Stevens said . When asked if he would ever leave Butler , Stevens replied " I guess if they kicked me out . " + After the 2011 – 12 season , Stevens was pursued vigorously by Illinois to fill their coaching vacancy before he declined their offer . + In March 2013 , UCLA reportedly offered Stevens between $ 2 @.@ 5 and $ 3 million a year to leave Butler . Rumors circulated that Stevens was in contract negotiations with UCLA , but ultimately the rumors proved false and Stevens stayed at Butler . Commenting on the situation , a source close to Stevens said " Brad doesn 't understand why people would assume he 's leaving . " A few days later , Stevens reiterated that he was very happy at Butler and had no intentions to leave as long as he had the support of the university to continue running the program the " right way " . + + = = NBA = = + + + = = = Boston Celtics = = = + + On July 3 , 2013 , Stevens was signed as the head coach by the Boston Celtics . Reports state that his new contract is a six @-@ year , $ 22 million deal . In April 2015 , Stevens led the Celtics to the NBA Playoffs as the 7th seed in the Eastern Conference with a 40 – 42 record . On April 21 , 2015 , it was announced that Stevens finished fourth in voting for the NBA 's Coach of the Year Award . On March 1 , 2016 , Stevens was named the Eastern Conference Coach of the Month for games played during February . In April 2016 , Brad Stevens led the Celtics to their second consecutive playoff appearance under his tenure as the 5th seed in the 2016 NBA Playoffs finishing the season with a 48 – 34 record . On June 1 , 2016 , Stevens received a contract extension . 
+ + = = Coaching style = = + + According to Stevens , in one of his first games as head coach , he was nervous and " felt like our team played on edge " because of it . He decided that a team 's play will reflect the mood of its coach ; a calm coach means a team that will remain poised in difficult game situations , while a nervous coach means a team that plays on edge . " I don ’ t want to lose a game because of my approach , " he told himself . Accordingly , he developed a strategy of always remaining calm and focused during games . He rarely raises his voice or gets emotional , instead quietly observing on the sideline with folded arms . He does not get upset about bad calls by referees or player mistakes , preferring to focus on " the next play " rather than what just happened . Butler player Willie Veasley explained Butler 's 2010 Final Four run by saying , " When those big runs [ by Syracuse and Kansas State ] came , Coach called a timeout and said a few calm words . Then he said he believes in us , he loves us and we 're going to win the game . " On the rare occasion Stevens feels the need to correct a player , he does it with " positive reinforcement , just at a little louder decibel " , according to former assistant coach Matthew Graves . Above all , Stevens wants his players to be confident , not living in fear of being yanked for making a bad play . + Externally , Stevens is always calm , but internally he is far from it . " I 'm not as calm as everybody thinks , " Stevens says . His wife Tracy adds , " He ’ s calm and collected , but he ’ s fiercely competitive . He ’ s always thinking about how he can beat you . " Former player Joel Cornette says " Everyone sees Brad as a level @-@ headed , calm and cool coach , but he ’ s about as competitive of a guy as I know . We would get into it constantly , whether playing two @-@ on @-@ two or arguing about players ’ having better college careers . 
" + Stevens spends a lot of time preparing for each game , and always tries to add a few new wrinkles specific to that game 's opponent . Sports Illustrated calls Stevens an expert " on breaking down tape and looking at statistical trends to find opponents ' weaknesses . " Former player Ronald Nored agrees : " We know everything we need to about our opponents , all their tendencies are broken down " ahead of time . + Stevens is a proponent of using statistical analysis to enhance his coaching decisions , spending almost as much time looking at statistics as watching game film . " I think it 's a unique way of looking at the game that may be able to help best communicate to your players " , he explains . For example , when Butler was slumping in late 2010 , Stevens challenged his team : " this [ 46 % defensive field goal percentage ] is where we are . This isn 't acceptable to get to where we want to go . But what does that really mean ? It 's not just get better defensively , it is , if we give up 3 less baskets a game , then we will be at 40 percent field goal percentage defense which will be top 20 in the country " . The team got the message , improved throughout the season , and ultimately went on a March run fueled by defense . In 2012 , Stevens became the first college coach to hire someone solely for statistical research when he added Drew Cannon to the staff . If he had the resources , Stevens says he would hire a team of statisticians to analyze the team 's play . + Stevens ' teams are built around solid basketball fundamentals and good teamwork , rather than individual basketball skill . His teams are known for their defense , forcing opponents into uncharacteristic mistakes . The secret to basketball – and life – is " just to do the job to the best of your ability and don 't worry about anything else , " Stevens says . " Win the next game . Win the next possession . That 's our focus . It 's boring . It 's also the way championships are won " , he says .
In short , Stevens is a strong believer in " The Butler Way " – doing all the little things that transform a group of good basketball players into a great basketball team . " I tell the players ' the Butler Way ' isn 't easy to define , " Stevens says , " but you can see it on the floor when we share the basketball , play with great energy and defend . " + Stevens prefers to recruit strong team players instead of going after " top recruits . " " The guys we [ have ] recruited , most of them weren 't very highly ranked , " Stevens says . " They had very good high school careers or careers at other places ( transfers ) , but for one reason or the other they weren 't seen as great players . But they all had intangibles . " Stevens puts a strong emphasis on education and has said he would only recruit a " one and done " player if he was committed to getting his degree while playing professionally . + Stevens has often been referred to as a coaching prodigy , but is not interested in self @-@ promotion . He instead prefers to deflect the praise he receives to the players , athletic department , and his mentors . He has not been known to posture for more money , or to leak his name for open coaching positions . He has been described as humble , modest , and not " about the money " . + The New York Times , USA Today , ESPN , and other commentators have attributed Butler 's success against teams with superior athletes to Stevens ' coaching style . The Times remarks , " the Bulldogs are very well prepared for their opponents , and they do not rattle easily " , and says that the resulting confidence has led to the team 's success . " He coaches to his personality and to his strengths , " Collier says . " Obviously , he has great rapport and communication ability with his team . " Yahoo ! Sports compared Stevens to legendary coach John Wooden writing " Brad Stevens is winning at Butler the Wooden way – calm and composed on the sideline . 
" Wooden agreed , saying , " I enjoy watching [ Stevens ] and very much enjoy [ Butler 's ] style of play . " + + = = Personal life = = + + Brad Stevens is known for his youthful looks , often being described as " baby @-@ faced " . One commentator remarked , " Stevens looks like he checks the mirror every morning to see if it 's time to start shaving . " On occasion , he has been mistaken for a player . He is also known for projecting a professional , " corporate " look from the sidelines . Asked what his life would be like if he had never taken up coaching , he replies " If everything else remained the same , I would have been as happy as heck ... Friends and family and faith , they 're going to take the cake over all this stuff . " Stevens met his wife , Tracy ( née Wilhelmy ) , while attending DePauw University . Tracy , who played soccer for DePauw , quickly learned of Brad 's love for basketball ; on their third date he drove her an hour and a half to attend a high school basketball game . Tracy graduated from Rocky River High School in 1995 , and from DePauw in 1999 . She returned to school in 2000 , driving five hours from Case Western 's law school to Indianapolis on weekends to see Brad . She finished her final year of law school in Indianapolis , and the couple married in August 2003 . Tracy works as a labor and employment lawyer . Tracy also serves as Brad 's agent . + Brad and Tracy Stevens are involved with the American Cancer Society 's Coaches Vs . Cancer . Brad says that the cause really hit home for them after Tracy 's mother died of the disease in June 2004 . The day before Butler 's 2010 Final Four appearance , they hosted a fundraiser for the organization . Brad Stevens has also volunteered his time to the Jukes Foundation for Kids , a charity benefiting Ugandan children run by former Butler player Avery Jukes . 
Stevens remains in close touch with the Butler basketball family ; he notably took a one @-@ game leave from the Celtics in January 2016 to visit with Andrew Smith , a player on both of Butler 's Final Four teams who was dying of cancer ; Smith died less than a week later . At the request of Andrew 's widow , Sam , Brad delivered the eulogy at the memorial service on January 17 , 2016 . + Stevens ' father , Mark , is an orthopedic surgeon in Indianapolis and former Indiana Hoosiers football player . His mother , Jan , is a university professor . She has previously taught at Butler . + + = = Head coaching record = = + + + = = = College = = = + + + = = = NBA = = = + + + = = = Awards and nominations = = = + + Henry Iba Coach of the Year Award finalist ( 2009 ) + Horizon League Coach of the Year ( 2009 , 2010 ) + Hugh Durham Award for Mid @-@ major Coach of the Year finalist ( 2008 , 2009 , 2010 ) + Hugh Durham Award Mid @-@ season honors ( 2009 ) + Jim Phelan National Coach of the Year Award finalist ( 2008 , 2009 , 2010 ) + Skip Prosser Man of the Year Award finalist ( 2010 ) + + + = Shackleton ( crater ) = + + Shackleton is an impact crater that lies at the south pole of the Moon . The peaks along the crater 's rim are exposed to almost continual sunlight , while the interior is perpetually in shadow ( a Crater of eternal darkness ) . The low @-@ temperature interior of this crater functions as a cold trap that may capture and freeze volatiles shed during comet impacts on the Moon . Measurements by the Lunar Prospector spacecraft showed higher than normal amounts of hydrogen within the crater , which may indicate the presence of water ice . The crater is named after Antarctic explorer Ernest Shackleton . + + = = Description = = + + The rotational axis of the Moon lies within Shackleton , only a few kilometers from its center . The crater is 21 km in diameter and 4 @.@ 2 km deep . From the Earth , it is viewed edge @-@ on in a region of rough , cratered terrain . 
It is located within the South Pole @-@ Aitken basin on a massif . The rim is slightly raised above the surrounding surface and it has an outer rampart that has been only lightly impacted . No significant craters intersect the rim , and it is sloped about 1 @.@ 5 ° toward the direction 50 – 90 ° from the Earth . The age of the crater is about 3 @.@ 6 billion years and it has been in the proximity of the south lunar pole for at least the last two billion years . + Because the orbit of the Moon is tilted only 5 ° from the ecliptic , the interior of this crater lies in perpetual darkness . Estimates of the area in permanent shadow were obtained from Earth @-@ based radar studies . Peaks along the rim of the crater are almost continually illuminated by sunlight , spending about 80 – 90 % of each lunar orbit exposed to the Sun . Continuously illuminated mountains have been termed peaks of eternal light and have been predicted to exist since the 1900s . + The shadowed portion of the crater was imaged with the Terrain Camera of the Japanese SELENE spacecraft using the illumination of sunlight reflected off the rim . The interior of the crater consists of a symmetrical 30 ° slope that leads down to a 6 @.@ 6 km diameter floor . The handful of craters along the interior span no more than a few hundred meters . The bottom is covered by an uneven mound @-@ like feature that is 300 to 400 m thick . The central peak is about 200 m in height . + The continuous shadows in the south polar craters cause the floors of these formations to maintain a temperature that never exceeds about 100 K. For Shackleton , the average temperature was determined to be about 90 K , reaching 88 K at the crater floor . Under these conditions , the estimated rate of loss from any ice in the interior would be $10^{-26}$ to $10^{-27}$ m / s . Any water vapor that arrives here following a cometary impact on the Moon would lie permanently frozen on or below the surface .
However , the surface albedo of the crater floor matches the lunar far @-@ side , suggesting that there is no exposed surface ice . + This crater was named after Ernest Henry Shackleton , an Anglo @-@ Irish explorer of Antarctica from 1901 until his death in 1922 . The name was officially adopted by the International Astronomical Union in 1994 . Nearby craters of note include Shoemaker , Haworth , de Gerlache , Sverdrup , and Faustini . Somewhat farther away , on the eastern hemisphere of the lunar near side , are the larger craters Amundsen and Scott , named after two other early explorers of the Antarctic continent . + + = = Exploration = = + + From the perspective of the Earth , this crater lies along the southern limb of the Moon , making observation difficult . Detailed mapping of the polar regions and farside of the Moon did not occur until the advent of orbiting spacecraft . Shackleton lies entirely within the rim of the immense South Pole @-@ Aitken basin , which is one of the largest known impact formations in the Solar System . This basin is over 12 kilometers deep , and an exploration of its properties could provide useful information about the lunar interior . + A neutron spectrometer on board the Lunar Prospector spacecraft detected enhanced concentrations of hydrogen close to the northern and southern lunar poles , including the crater Shackleton . At the end of this mission in July 1999 , the spacecraft was crashed into the nearby crater Shoemaker in the hope of detecting from Earth @-@ based telescopes an impact @-@ generated plume containing water vapor . The impact event did not produce any detectable water vapor , and this may be an indication that the hydrogen is not in the form of hydrated minerals , or that the impact site did not contain any ice . Alternatively , it is possible that the crash did not excavate deeply enough into the regolith to liberate significant quantities of water vapor . 
+ From Earth @-@ based radar and spacecraft images of the crater edge , Shackleton appears to be relatively intact ; much like a young crater that has not been significantly eroded by subsequent impacts . This may mean that the inner sides are relatively steep , which may make traversing the sides relatively difficult for a robotic vehicle . In addition , it is possible that the interior floor might not have collected a significant quantity of volatiles since its formation . However , other craters in the vicinity are considerably older , and may contain significant deposits of hydrogen , possibly in the form of water ice . ( See Shoemaker ( lunar crater ) , for example . ) + Radar studies preceding and following the Lunar Prospector mission demonstrate that the inner walls of Shackleton are similar in reflective characteristics to those of some sunlit craters . In particular , the surroundings appear to contain a significant number of blocks in its ejecta blanket , suggesting that its radar properties are a result of surface roughness , and not ice deposits , as was previously suggested from a radar experiment involving the Clementine mission . This interpretation , however , is not universally agreed upon within the scientific community . Radar images of the crater at a wavelength of 13 cm show no evidence for water ice deposits . + Optical imaging inside the crater was done for the first time by the Japanese lunar orbiter spacecraft Kaguya in 2007 . It did not find any evidence of significant amounts of water ice , down to the image resolution of 10 m per pixel . + On November 15 , 2008 , a 34 @-@ kg probe made a hard landing near the crater . The moon impact probe ( MIP ) was launched from the Indian Chandrayaan @-@ I spacecraft and reached the surface 25 minutes later . The probe carried a radar altimeter , video imaging system , and a mass spectrometer , which were to be used to search for water .
+ + = = Potential uses = = + + Some sites along Shackleton 's rim receive almost constant illumination . At these locales sunlight is almost always available for conversion into electricity using solar panels , potentially making them good locations for future Moon landings . The temperature at this site is also more favorable than at more equatorial latitudes as it does not experience the daily temperature extremes of 100 ° C when the Sun is overhead , to as low as − 150 ° C during the lunar night . + While scientific experiments performed by Clementine and Lunar Prospector could indicate the presence of water in the polar craters , the current evidence is far from definitive . There are doubts among scientists as to whether or not the hydrogen is in the form of ice , as well as to the concentration of this " ore " with depth below the surface . Resolution of this issue will require future missions to the Moon . The presence of water suggests that the crater floor could potentially be " mined " for deposits of hydrogen in water form , a commodity that is expensive to deliver directly from the Earth . + This crater has also been proposed as a future site for a large infrared telescope . The low temperature of the crater floor makes it ideal for infrared observations , and solar cells placed along the rim could provide near @-@ continuous power to the observatory . About 120 kilometers from the crater lies the 5 @-@ km tall Malapert Mountain , a peak that is perpetually visible from the Earth , and which could serve as a radio relay station when suitably equipped . + NASA has named the rim of Shackleton as a potential candidate for its lunar outpost , slated to be up and running by 2020 and continuously staffed by a crew by 2024 . The location would promote self @-@ sustainability for lunar residents , as perpetual sunlight on the south pole would provide energy for solar panels . 
Furthermore , the shadowed polar regions are believed to contain the frozen water necessary for human consumption and could also be harvested for fuel manufacture . + + + = American Beauty ( 1999 film ) = + + American Beauty is a 1999 American drama film directed by Sam Mendes and written by Alan Ball . Kevin Spacey stars as Lester Burnham , a 42 @-@ year @-@ old advertising executive who has a midlife crisis when he becomes infatuated with his teenaged daughter 's best friend , Angela ( Mena Suvari ) . Annette Bening co @-@ stars as Lester 's materialistic wife , Carolyn , and Thora Birch plays their insecure daughter , Jane . Wes Bentley , Chris Cooper , and Allison Janney also feature . The film is described by academics as a satire of American middle @-@ class notions of beauty and personal satisfaction ; analysis has focused on the film 's explorations of romantic , and paternal love , sexuality , beauty , materialism , self @-@ liberation , and redemption . + Ball began writing American Beauty as a play in the early 1990s , partly inspired by the media circus around the Amy Fisher trial in 1992 . He shelved the play after realizing the story would not work on stage . After several years as a television screenwriter , Ball revived the idea in 1997 when attempting to break into the film industry . The modified script had a cynical outlook that was influenced by Ball 's frustrating tenures writing for several sitcoms . Producers Dan Jinks and Bruce Cohen took American Beauty to DreamWorks ; the then @-@ fledgling film studio bought Ball 's script for $ 250 @,@ 000 , outbidding several other production bodies . DreamWorks financed the $ 15 million production and served as its North American distributor . American Beauty marked acclaimed theater director Mendes ' film debut ; courted after his successful productions of the musicals Oliver ! 
and Cabaret , Mendes was , nevertheless , only given the job after 20 others were considered and several " A @-@ list " directors turned down the opportunity . + Spacey was Mendes ' first choice for the role of Lester , though DreamWorks had urged the director to consider better @-@ known actors ; similarly , the studio suggested several actors for the role of Carolyn until Mendes offered the part to Bening without DreamWorks ' knowledge . Principal photography took place between December 1998 and February 1999 on soundstages at the Warner Bros. backlot in Burbank , California , and on location in Los Angeles . Mendes ' dominant style was deliberate and composed ; he made extensive use of static shots and slow pans and zooms to generate tension . Cinematographer Conrad Hall complemented Mendes ' style with peaceful shot compositions to contrast with the turbulent on @-@ screen events . During editing , Mendes made several changes that gave the film a less cynical tone than the script . + Released in North America on September 17 , 1999 , American Beauty was positively received by critics and audiences ; it was the best @-@ reviewed American film of the year and grossed over $ 356 million worldwide . Reviewers praised most aspects of the production , with particular emphasis on Mendes , Spacey , and Ball ; criticism focused on the familiarity of the characters and setting . DreamWorks launched a major campaign to increase the film 's chances of Academy Award success ; at the 72nd Academy Awards the following year , the film won Best Picture , Best Director , Best Actor ( for Spacey ) , Best Original Screenplay , and Best Cinematography . It was nominated for and won many other awards and honors , mainly for the direction , writing , and acting . + + = = Plot = = + + Lester Burnham is a middle @-@ aged advertising executive and magazine writer who despises his job . 
He is unhappily married to Carolyn , a neurotic yet fiercely ambitious real estate broker ; their teenaged daughter , Jane , abhors her parents and has low self @-@ esteem . The Burnhams ' new neighbors are retired United States Marine Corps Colonel Frank Fitts and his near @-@ catatonic wife , Barbara . Their teenaged son , Ricky , constantly films his surroundings with a camcorder , collecting hundreds of recordings on videotapes in his bedroom . His job as a part @-@ time bar caterer serves as a front for his secret marijuana dealing . Col. Fitts is a strict disciplinarian who previously sent Ricky to a military school and briefly committed him to a psychiatric hospital . Jim Olmeyer and Jim Berkley , a gay couple who live nearby , welcome the family to the neighborhood ; the homophobic Col. Fitts angrily asks Ricky " why these faggots have to rub it in your face . " + Lester becomes infatuated with Jane 's vain friend , Angela Hayes , after seeing her perform a half @-@ time dance routine at a high school basketball game . He starts having sexual fantasies about Angela , in which red rose petals are a recurring motif . Carolyn begins an affair with her married business rival , Buddy Kane . When Lester 's boss , Brad , tells him that he is to be laid off , Lester instead blackmails him for $ 60 @,@ 000 and quits his job . Lester takes a minimum @-@ wage job at a fast @-@ food restaurant , trades in his Toyota Camry for his dream car , a 1970 Pontiac Firebird , and starts working out after he overhears Angela tell Jane that she would find him sexually attractive if he got in shape . He begins smoking marijuana supplied by Ricky . The girls ' friendship wanes after Jane starts a relationship with Ricky . Jane and Ricky bond over what Ricky considers the most beautiful imagery he has filmed : a plastic bag being blown in the wind . + Lester discovers Carolyn 's infidelity , but reacts indifferently . Buddy ends the affair , fearing an expensive divorce . Col. 
Fitts becomes suspicious of Lester and Ricky 's friendship when he finds his son 's footage of Lester lifting weights while nude , which Ricky captured by chance , leading him to believe that Ricky is gay . After spying on Ricky and Lester through Lester 's garage window , the colonel mistakenly concludes the pair is sexually involved . He later confronts and beats Ricky for the supposed affair and accuses him of being gay . Ricky falsely admits the charges and goads his father into kicking him out of their home . Meanwhile , Carolyn is sitting in her car in the rain , taking a gun out of the glove box while a voice on the radio talks about not being a victim . Ricky goes to Jane 's bedroom , finding her arguing with Angela about Angela 's flirtation with Lester . Ricky convinces Jane to flee with him to New York City and assures Angela that she is ugly , boring , and ordinary . + Col. Fitts confronts Lester and attempts to kiss him ; Lester rebuffs the colonel , who tearfully flees . Carolyn puts the gun in her handbag , shouting , " I refuse to be a victim ! " Lester finds a distraught Angela sitting alone in the dark ; she asks him to tell her she is beautiful . He does , and they kiss . + Carolyn drives through the rain , rehearsing a confession to Lester . Just as Lester and Angela are about to have sex , she admits that she is a virgin , and Lester changes his mind . He instead comforts her and the pair bond over their shared frustrations . Angela goes to the bathroom and Lester smiles at a family photograph in his kitchen . An unseen figure raises a gun to the back of his head , a gunshot sounds , and blood sprays on the wall . Ricky and Jane find Lester 's body , while Carolyn breaks down crying in the closet . A bloodied Col. Fitts returns home , where a gun is shown to be missing from his collection . 
Lester 's closing narration describes meaningful experiences during his life ; he says that , despite his death , he is happy because there is " so much beauty " in the world . + + = = Themes and analysis = = + + + = = = Multiple interpretations = = = + + Scholars and academics have offered many possible readings of American Beauty ; film critics are similarly divided , not so much about the quality of the film , as their interpretations of it . Described by many as about " the meaning of life " or " the hollow existence of the American suburbs " , the film has defied categorization by even the filmmakers . Mendes is indecisive , saying the script seemed to be about something different each time he read it : " a mystery story , a kaleidoscopic journey through American suburbia , a series of love stories ; [ ... ] it was about imprisonment , [ ... ] loneliness , [ and ] beauty . It was funny ; it was angry , sad . " The literary critic and author Wayne C. Booth concludes that the film resists any one interpretation : " [ American Beauty ] cannot be adequately summarized as ' here is a satire on what 's wrong with American life ' ; that plays down the celebration of beauty . It is more tempting to summarize it as ' a portrait of the beauty underlying American miseries and misdeeds ' , but that plays down the scenes of cruelty and horror , and Ball 's disgust with our mores . It cannot be summarized with either Lester or Ricky 's philosophical statements about what life is or how one should live . " He argues that the problem of interpreting the film is tied with that of finding its center — a controlling voice who " [ unites ] all of the choices " . He contends that in American Beauty 's case , it is neither Mendes nor Ball . 
Mendes considers the voice to be Ball 's , but even while the writer was " strongly influential " on set , he often had to accept deviations from his vision , particularly ones that transformed the cynical tone of his script into something more optimistic . With " innumerable voices intruding on the original author 's , " Booth says , those who interpret American Beauty " have forgotten to probe for the elusive center " . According to Booth , the film 's true controller is the creative energy " that hundreds of people put into its production , agreeing and disagreeing , inserting and cutting " . + + = = = Imprisonment and redemption = = = + + Mendes called American Beauty a rite of passage film about imprisonment and escape from imprisonment . The monotony of Lester 's existence is established through his gray , nondescript workplace and characterless clothing . In these scenes , he is often framed as if trapped , " reiterating rituals that hardly please him " . He masturbates in the confines of his shower ; the shower stall evokes a jail cell and the shot is the first of many where Lester is confined behind bars or within frames , such as when he is reflected behind columns of numbers on a computer monitor , " confined [ and ] nearly crossed out " . The academic and author Jody W. Pennington argues that Lester 's journey is the story 's center . His sexual reawakening through meeting Angela is the first of several turning points as he begins to " [ throw ] off the responsibilities of the comfortable life he has come to despise " . After Lester shares a joint with Ricky , his spirit is released and he begins to rebel against Carolyn . Changed by Ricky 's " attractive , profound confidence " , Lester is convinced that Angela is attainable and sees that he must question his " banal , numbingly materialist suburban existence " ; he takes a job at a fast @-@ food outlet , which allows him to regress to a point when he could " see his whole life ahead of him " . 
+ When Lester is caught masturbating by Carolyn , his angry retort about their lack of intimacy is the first time he says aloud what he thinks about her . By confronting the issue and Carolyn 's " superficial investments in others " , Lester is trying to " regain a voice in a home that [ only respects ] the voices of mother and daughter " . His final turning point comes when Angela and he almost have sex ; after she confesses her virginity , he no longer thinks of her as a sex object , but as a daughter . He holds her close and " wraps her up " . Mendes called it " the most satisfying end to [ Lester 's ] journey there could possibly have been " . With these final scenes , Mendes intended to show Lester at the conclusion of a " mythical quest " . After Lester gets a beer from the refrigerator , the camera pushes toward him , then stops facing a hallway down which he walks " to meet his fate " . Having begun to act his age again , Lester achieves closure . As he smiles at a family photo , the camera pans slowly from Lester to the kitchen wall , onto which blood spatters as a gunshot rings out ; the slow pan reflects the peace of Lester 's death . His body is discovered by Jane and Ricky . Mendes said that Ricky 's staring into Lester 's dead eyes is " the culmination of the theme " of the film : that beauty is found where it is least expected . + + = = = Conformity and beauty = = = + + Like other American films of 1999 — such as Fight Club , Bringing Out the Dead , and Magnolia — American Beauty instructs its audience to " [ lead ] more meaningful lives " . The film argues the case against conformity , but does not deny that people need and want it ; even the gay characters just want to fit in . Jim and Jim , the Burnhams ' other neighbors , are a satire of " gay bourgeois coupledom " , who " [ invest ] in the numbing sameness " that the film criticizes in heterosexual couples . The feminist academic and author Sally R.
Munt argues that American Beauty uses its " art house " trappings to direct its message of nonconformity primarily to the middle classes , and that this approach is a " cliché of bourgeois preoccupation ; [ ... ] the underlying premise being that the luxury of finding an individual ' self ' through denial and renunciation is always open to those wealthy enough to choose , and sly enough to present themselves sympathetically as a rebel . " + Professor Roy M. Anker argues that the film 's thematic center is its direction to the audience to " look closer " . The opening combines an unfamiliar viewpoint of the Burnhams ' neighborhood with Lester 's narrated admission that he will soon die , forcing audiences to consider their own mortality and the beauty around them . It also sets a series of mysteries ; Anker asks , " from what place exactly , and from what state of being , is he telling this story ? If he 's already dead , why bother with whatever it is he wishes to tell about his last year of being alive ? There is also the question of how Lester has died — or will die . " Anker believes the preceding scene — Jane 's discussion with Ricky about the possibility of his killing her father — adds further mystery . Professor Ann C. Hall disagrees ; she says by presenting an early resolution to the mystery , the film allows the audience to put it aside " to view the film and its philosophical issues " . Through this examination of Lester 's life , rebirth and death , American Beauty satirizes American middle class notions of meaning , beauty and satisfaction . Even Lester 's transformation only comes about because of the possibility of sex with Angela ; he therefore remains a " willing devotee of the popular media 's exaltation of pubescent male sexuality as a sensible route to personal wholeness " . 
Carolyn is similarly driven by conventional views of happiness ; from her belief in " house beautiful " domestic bliss to her car and gardening outfit , Carolyn 's domain is a " fetching American millennial vision of Pleasantville , or Eden " . The Burnhams are unaware that they are " materialists philosophically , and devout consumers ethically " who expect the " rudiments of American beauty " to give them happiness . Anker argues that " they are helpless in the face of the prettified economic and sexual stereotypes [ ... ] that they and their culture have designated for their salvation . " + The film presents Ricky as its " visionary , [ ... ] spiritual and mystical center " . He sees beauty in the minutiae of everyday life , videoing as much as he can for fear of missing it . He shows Jane what he considers the most beautiful thing he has filmed : a plastic bag , tossing in the wind in front of a wall . He says capturing the moment was when he realized that there was " an entire life behind things " ; he feels that " sometimes there 's so much beauty in the world I feel like I can 't take it ... and my heart is going to cave in . " Anker argues that Ricky , in looking past the " cultural dross " , has " [ grasped ] the radiant splendor of the created world " to see God . As the film progresses , the Burnhams move closer to Ricky 's view of the world . Lester only forswears personal satisfaction at the film 's end . On the cusp of having sex with Angela , he returns to himself after she admits her virginity . Suddenly confronted with a child , he begins to treat her as a daughter ; in doing so , Lester sees himself , Angela , and his family " for the poor and fragile but wondrous creatures they are " . He looks at a picture of his family in happier times , and dies having had an epiphany that infuses him with " wonder , joy , and soul @-@ shaking gratitude " — he has finally seen the world as it is . 
+ According to Patti Bellantoni , colors are used symbolically throughout the film , none more so than red , which is an important thematic signature that drives the story and " [ defines ] Lester 's arc " . First seen in drab colors that reflect his passivity , Lester surrounds himself with red as he regains his individuality . The American Beauty rose is repeatedly used as a symbol ; when Lester fantasizes about Angela , she is usually naked and surrounded by rose petals . In these scenes , the rose symbolizes Lester 's desire for her . When associated with Carolyn , the rose represents a " façade for suburban success " . Roses are included in almost every shot inside the Burnhams ' home , where they signify " a mask covering a bleak , unbeautiful reality " . Carolyn feels that " as long as there can be roses , all is well " . She cuts the roses and puts them in vases , where they adorn her " meretricious vision of what makes for beauty " and begin to die . The roses in the vase in the Angela – Lester seduction scene symbolize Lester 's previous life and Carolyn ; the camera pushes in as Lester and Angela get closer , finally taking the roses — and thus Carolyn — out of the shot . Lester 's epiphany at the end of the film is expressed by rain and the use of red , building to a crescendo that is a deliberate contrast to the release Lester feels . The constant use of red " lulls [ the audience ] subliminally " into becoming used to it ; consequently , it leaves the audience unprepared when Lester is shot and his blood spatters on the wall . + + = = = Sexuality and repression = = = + + Pennington argues that American Beauty defines its characters through their sexuality . Lester 's attempts to relive his youth are a direct result of his lust for Angela , and the state of his relationship with Carolyn is in part shown through their lack of sexual contact .
Also sexually frustrated , Carolyn has an affair that takes her from " cold perfectionist " to a more carefree soul who " [ sings ] happily along with " the music in her car . Jane and Angela constantly reference sex , through Angela 's descriptions of her supposed sexual encounters and the way the girls address each other . Their nude scenes are used to communicate their vulnerability . By the end of the film , Angela 's hold on Jane has weakened until the only power she has over her friend is Lester 's attraction to her . Col. Fitts reacts with disgust to meeting Jim and Jim ; he asks , " How come these faggots always have to rub it in your face ? How can they be so shameless ? " To which Ricky replies , " That 's the thing , Dad — they don 't feel like it 's anything to be ashamed of . " Pennington argues that Col. Fitts ' reaction is not homophobic , but an " anguished self @-@ interrogation " . + With other turn @-@ of @-@ the @-@ millennium films such as Fight Club , In the Company of Men ( 1997 ) , American Psycho ( 2000 ) , and Boys Don 't Cry ( 1999 ) , American Beauty " raises the broader , widely explored issue of masculinity in crisis " . Professor Vincent Hausmann charges that in their reinforcement of masculinity " against threats posed by war , by consumerism , and by feminist and queer challenges " , these films present a need to " focus on , and even to privilege " aspects of maleness " deemed ' deviant ' " . Lester 's transformation conveys " that he , and not the woman , has borne the brunt of [ lack of being ] " and he will not stand for being emasculated . Lester 's attempts to " strengthen traditional masculinity " conflict with his responsibilities as a father . Although the film portrays the way Lester returns to that role positively , he does not become " the hypermasculine figure implicitly celebrated in films like Fight Club " . 
Hausmann concludes that Lester 's behavior toward Angela is " a misguided but nearly necessary step toward his becoming a father again " . + Hausmann says the film " explicitly affirms the importance of upholding the prohibition against incest " ; a recurring theme of Ball 's work is his comparison of the taboos against incest and homosexuality . Instead of making an overt distinction , American Beauty looks at how their repression can lead to violence . Col. Fitts is so ashamed of his homosexuality that it drives him to murder Lester . Ball said , " The movie is in part about how homophobia is based in fear and repression and about what [ they ] can do . " The film implies two unfulfilled incestuous desires : Lester 's pursuit of Angela is a manifestation of his lust for his own daughter , while Col. Fitts ' repression is exhibited through the almost sexualized discipline with which he controls Ricky . Consequently , Ricky realizes that he can only hurt his father by falsely telling him he is homosexual , while Angela 's vulnerability and submission to Lester reminds him of his responsibilities and the limits of his fantasy . Col. Fitts represents Ball 's father , whose repressed homosexual desires led to his own unhappiness . Ball rewrote Col. Fitts to delay revealing him as homosexual , which Munt reads as a possible " deferment of Ball 's own patriarchal @-@ incest fantasies " . + + = = = Temporality and music = = = + + American Beauty follows a traditional narrative structure , only deviating with the displaced opening scene of Jane and Ricky from the middle of the story . Although the plot spans one year , the film is narrated by Lester at the moment of his death . Jacqueline Furby says that the plot " occupies [ ... ] no time [ or ] all time " , citing Lester 's claim that life did not flash before his eyes , but that it " stretches on forever like an ocean of time " . Furby argues that a " rhythm of repetition " forms the core of the film 's structure . 
For example , two scenes have the Burnhams sitting down to an evening meal , shot from the same angle . Each image is broadly similar , with minor differences in object placement and body language that reflect the changed dynamic brought on by Lester 's new @-@ found assertiveness . Another example is the pair of scenes in which Jane and Ricky film each other . Ricky films Jane from his bedroom window as she removes her bra , and the image is reversed later for a similarly " voyeuristic and exhibitionist " scene in which Jane films Ricky at a vulnerable moment . + Lester 's fantasies are emphasized by slow- and repetitive @-@ motion shots ; Mendes uses double @-@ and @-@ triple cutbacks in several sequences , and the score alters to make the audience aware that it is entering a fantasy . One example is the gymnasium scene — Lester 's first encounter with Angela . While the cheerleaders perform their half @-@ time routine to " On Broadway " , Lester becomes increasingly fixated on Angela . Time slows to represent his " voyeuristic hypnosis " and Lester begins to fantasize that Angela 's performance is for him alone . " On Broadway " — which provides a conventional underscore to the onscreen action — is replaced by discordant , percussive music that lacks melody or progression . This nondiegetic score is important to creating the narrative stasis in the sequence ; it conveys a moment for Lester that is stretched to an indeterminate length . The effect is one that Stan Link likens to " vertical time " , described by the composer and music theorist Jonathan Kramer as music that imparts " a single present stretched out into an enormous duration , a potentially infinite ' now ' that nonetheless feels like an instant " . The music is used like a visual cue , so that Lester and the score are staring at Angela . The sequence ends with the sudden reintroduction of " On Broadway " and teleological time . 
+ According to Drew Miller of Stylus , the soundtrack " [ gives ] unconscious voice " to the characters ' psyches and complements the subtext . The most obvious use of pop music " accompanies and gives context to " Lester 's attempts to recapture his youth ; reminiscent of how the counterculture of the 1960s combated American repression through music and drugs , Lester begins to smoke cannabis and listen to rock music . Mendes ' song choices " progress through the history of American popular music " . Miller argues that although some may be over familiar , there is a parodic element at work , " making good on [ the film 's ] encouragement that viewers look closer " . Toward the end of the film , Thomas Newman 's score features more prominently , creating " a disturbing tempo " that matches the tension of the visuals . The exception is " Don 't Let It Bring You Down " , which plays during Angela 's seduction of Lester . At first appropriate , its tone clashes as the seduction stops . The lyrics , which speak of " castles burning " , can be seen as a metaphor for Lester 's view of Angela — " the rosy , fantasy @-@ driven exterior of the ' American Beauty ' " — as it burns away to reveal " the timid , small @-@ breasted girl who , like his wife , has willfully developed a false public self " . + + = = Production = = + + + = = = Development = = = + + In 1997 , Alan Ball resolved to move into the film industry after several frustrating years writing for the television sitcoms Grace Under Fire and Cybill . He joined the United Talent Agency , where his representative , Andrew Cannava , suggested he write a spec script to " reintroduce [ himself ] to the town as a screenwriter " . Ball pitched three ideas to Cannava : two conventional romantic comedies and American Beauty , which he had originally conceived as a play in the early 1990s . 
Despite the story 's lack of an easily marketable concept , Cannava selected American Beauty because he felt it was the one for which Ball had the most passion . While developing the script , Ball created another television sitcom , Oh , Grow Up . He channeled his anger and frustration at having to accede to network demands on that show — and during his tenures on Grace Under Fire and Cybill — into writing American Beauty . + Ball did not expect to sell the script , believing it would act as more of a calling card , but American Beauty drew interest from several production bodies . Cannava passed the script to several producers , including Dan Jinks and Bruce Cohen , who took it to DreamWorks . With the help of executives Glenn Williamson and Bob Cooper , and Steven Spielberg in his capacity as studio partner , Ball was convinced to develop the project at DreamWorks ; he received assurances from the studio — known at the time for its more conventional fare — that it would not " iron the [ edges ] out " . In an unusual move , DreamWorks decided not to option the script ; instead , in April 1998 , the studio bought it outright for $ 250 @,@ 000 , outbidding Fox Searchlight Pictures , October Films , Metro @-@ Goldwyn @-@ Mayer , and Lakeshore Entertainment . DreamWorks planned to make the film for $ 6 – 8 million . + Jinks and Cohen involved Ball throughout the film 's development , including casting and director selection . The producers met with about 20 interested directors , several of whom were considered " A @-@ list " at the time . Ball was not keen on the more well @-@ known directors because he believed their involvement would increase the budget and lead DreamWorks to become " nervous about the content " . Nevertheless , the studio offered the film to Mike Nichols and Robert Zemeckis ; neither accepted . In the same year , Mendes ( then a theater director ) revived the musical Cabaret in New York with fellow director Rob Marshall . 
Beth Swofford of the Creative Artists Agency arranged meetings for Mendes with studio figures in Los Angeles to see if film direction was a possibility . Mendes came across American Beauty in a pile of eight scripts at Swofford 's house , and knew immediately that it was the one he wanted to make ; early in his career , he had been inspired by how the film Paris , Texas ( 1984 ) presented contemporary America as a mythic landscape and he saw the same theme in American Beauty , as well as parallels with his own childhood . Mendes later met with Spielberg ; impressed by Mendes ' productions of Oliver ! and Cabaret , Spielberg encouraged him to consider American Beauty . + Mendes found that he still had to convince DreamWorks ' production executives to let him direct . He had already discussed the film with Jinks and Cohen , and felt they supported him . Ball was also keen ; having seen Cabaret , he was impressed with Mendes ' " keen visual sense " and thought he did not make obvious choices . Ball felt that Mendes liked to look under the story 's surface , a talent he felt would be a good fit with the themes of American Beauty . Mendes ' background also reassured him , because of the prominent role the playwright usually has in a theater production . Over two meetings — the first with Cooper , Walter Parkes , and Laurie MacDonald , the second with Cooper alone — Mendes pitched himself to the studio . The studio soon approached Mendes with a deal to direct for the minimum salary allowed under Directors Guild of America rules — $ 150 @,@ 000 . Mendes accepted , and later recalled that after taxes and his agent 's commission , he only earned $ 38 @,@ 000 . In June 1998 , DreamWorks confirmed that it had contracted Mendes to direct the film . + + = = = Writing = = = + + Ball was partly inspired by two encounters he had in the early 1990s . In about 1991 – 92 , Ball saw a plastic bag blowing in the wind outside the World Trade Center . 
He watched the bag for 10 minutes , saying later that it provoked an " unexpected emotional response " . In 1992 , Ball became preoccupied with the media circus around the Amy Fisher trial . Discovering a comic book telling of the scandal , he was struck by how quickly it had become commercialized . He said he " felt like there was a real story underneath [ that was ] more fascinating and way more tragic " than the story presented to the public , and attempted to turn the idea into a play . Ball produced around 40 pages , but stopped when he realized it would work better as a film . He felt that because of the visual themes , and because each character 's story was " intensely personal " , it could not be done on a stage . All the main characters appeared in this version , but Carolyn did not feature strongly ; Jim and Jim instead had much larger roles . + Ball based Lester 's story on aspects of his own life . Lester 's re @-@ examination of his life parallels feelings Ball had in his mid @-@ 30s ; like Lester , Ball put aside his passions to work in jobs he hated for people he did not respect . Scenes in Ricky 's household reflect Ball 's own childhood experiences . Ball suspected his father was homosexual and used the idea to create Col. Fitts , a man who " gave up his chance to be himself " . Ball said the script 's mix of comedy and drama was not intentional , but that it came unconsciously from his own outlook on life . He said the juxtaposition produced a starker contrast , giving each trait more impact than if they appeared alone . + In the script that was sent to prospective actors and directors , Lester and Angela had sex ; by the time of shooting , Ball had rewritten the scene to the final version . Ball initially rebuffed counsel from others that he change the script , feeling they were being puritanical ; the final impetus to alter the scene came from DreamWorks ' then @-@ president Walter Parkes .
He convinced Ball by indicating that in Greek mythology , the hero " has a moment of epiphany before [ ... ] tragedy occurs " . Ball later said his anger when writing the first draft had blinded him to the idea that Lester needed to refuse sex with Angela to complete his emotional journey — to achieve redemption . Jinks and Cohen asked Ball not to alter the scene right away , as they felt it would be inappropriate to make changes to the script before a director had been hired . Early drafts also included a flashback to Col. Fitts ' service in the Marines , a sequence that unequivocally established his homosexual leanings . In love with another Marine , Col. Fitts sees the man die and comes to believe that he is being punished for the " sin " of being gay . Ball removed the sequence because it did not fit the structure of the rest of the film — Col. Fitts was the only character to have a flashback — and because it removed the element of surprise from Col. Fitts ' later pass at Lester . Ball said he had to write it for his own benefit to know what happened to Col. Fitts , though all that remained in later drafts was subtext . + Ball remained involved throughout production ; he had signed a television show development deal , so had to get permission from his producers to take a year off to be close to American Beauty . Ball was on @-@ set for rewrites and to help interpret his script for all but two days of filming . His original bookend scenes — in which Ricky and Jane are prosecuted for Lester 's murder after being framed by Col. Fitts — were excised in post @-@ production ; the writer later felt the scenes were unnecessary , saying they were a reflection of his " anger and cynicism " at the time of writing ( see " Editing " ) . Ball and Mendes revised the script twice before it was sent to the actors , and twice more before the first read @-@ through . 
+ The shooting script features a scene in Angela 's car in which Ricky and Jane talk about death and beauty ; the scene differed from earlier versions , which set it as a " big scene on a freeway " in which the three witness a car crash and see a dead body . The change was a practical decision , as the production was behind schedule and they needed to cut costs . The schedule called for two days to be spent filming the crash , but only half a day was available . Ball agreed , but only if the scene could retain a line of Ricky 's where he reflects on having once seen a dead homeless woman : " When you see something like that , it 's like God is looking right at you , just for a second . And if you 're careful , you can look right back . " Jane asks : " And what do you see ? " Ricky : " Beauty . " Ball said , " They wanted to cut that scene . They said it 's not important . I said , ' You 're out of your fucking mind . It 's one of the most important scenes in the movie ! ' [ ... ] If any one line is the heart and soul of this movie , that is the line . " Another scene was rewritten to accommodate the loss of the freeway sequence ; set in a schoolyard , it presents a " turning point " for Jane in that she chooses to walk home with Ricky instead of going with Angela . By the end of filming , the script had been through 10 drafts . + + = = = Casting = = = + + Mendes had Spacey and Bening in mind for the leads from the beginning , but DreamWorks executives were unenthusiastic . The studio suggested several alternatives , including Bruce Willis , Kevin Costner , or John Travolta to play Lester , and Helen Hunt or Holly Hunter to play Carolyn . Mendes did not want a big star " weighing the film down " ; he felt Spacey was the right choice based on his performances in the 1995 films The Usual Suspects and Seven , and 1992 's Glengarry Glen Ross . Spacey was surprised ; he said , " I usually play characters who are very quick , very manipulative and smart . [ ... 
] I usually wade in dark , sort of treacherous waters . This is a man living one step at a time , playing by his instincts . This is actually much closer to me , to what I am , than those other parts . " Mendes offered Bening the role of Carolyn without the studio 's consent ; although executives were upset at Mendes , by September 1998 , DreamWorks had entered negotiations with Spacey and Bening . + Spacey loosely based Lester 's early " schlubby " deportment on Walter Matthau . During the film , Lester 's physique improves from flabby to toned ; Spacey worked out during filming to improve his body , but because Mendes shot the scenes out of chronological order , Spacey varied postures to portray the stages . Before filming , Mendes and Spacey analyzed Jack Lemmon 's performance in The Apartment ( 1960 ) , because Mendes wanted Spacey to emulate " the way [ Lemmon ] moved , the way he looked , the way he was in that office and the way he was an ordinary man and yet a special man " . Spacey 's voiceover is a throwback to Sunset Boulevard ( 1950 ) , which is also narrated in retrospect by a dead character . Mendes felt it evoked Lester 's — and the film 's — loneliness . Bening recalled women from her youth to inform her performance : " I used to babysit constantly . You 'd go to church and see how people present themselves on the outside , and then be inside their house and see the difference . " Bening and a hair stylist collaborated to create a " PTA president coif " hairstyle , and Mendes and production designer Naomi Shohan researched mail @-@ order catalogs to better establish Carolyn 's environment of a " spotless suburban manor " . To help Bening get into Carolyn 's mindset , Mendes gave her music that he believed Carolyn would like . He lent Bening the Bobby Darin version of the song " Don 't Rain on My Parade " , which she enjoyed and persuaded the director to include it for a scene in which Carolyn sings in her car . 
+ For the roles of Jane , Ricky , and Angela , DreamWorks gave Mendes carte blanche . By November 1998 , Thora Birch , Wes Bentley , and Mena Suvari had been cast in the parts — in Birch 's case , despite the fact she was underage for her nude scene . As Birch was 16 at the time she made the film , and thus classified as a minor in the United States , her parents had to approve her brief topless scene in the movie . Her parents and child labor representatives were on the set for the shooting of the scene . Bentley overcame competition from top actors under the age of 25 to be cast . The 2009 documentary My Big Break followed Bentley , and several other young actors , before and after he landed the part . To prepare , Mendes provided Bentley with a video camera , telling the actor to film what Ricky would . Peter Gallagher and Allison Janney were cast ( as Buddy Kane and Barbara Fitts ) after filming began in December 1998 . Mendes gave Janney a book of paintings by Edvard Munch . He told her , " Your character is in there somewhere . " Mendes cut much of Barbara 's dialogue , including conversations between Colonel Fitts and her , as he felt that what needed to be said about the pair — their humanity and vulnerability — was conveyed successfully through their shared moments of silence . Chris Cooper plays Colonel Fitts , Scott Bakula plays Jim Olmeyer , and Sam Robards plays Jim Berkley . Jim and Jim were deliberately depicted as the most normal , happy — and boring — couple in the film . Ball 's inspiration for the characters came from a thought he had after seeing a " bland , boring , heterosexual couple " who wore matching clothes : " I can 't wait for the time when a gay couple can be just as boring . " Ball also included aspects of a gay couple he knew who had the same forename . + Mendes insisted on two weeks of cast rehearsals , although the sessions were not as formal as he was used to in the theater , and the actors could not be present at every one .
Several improvisations and suggestions by the actors were incorporated into the script . An early scene showing the Burnhams leaving home for work was inserted later on to show the low point that Carolyn and Lester 's relationship had reached . Spacey and Bening worked to create a sense of the love that Lester and Carolyn once had for one another ; for example , the scene in which Lester almost seduces Carolyn after the pair argues over Lester 's buying a car was originally " strictly contentious " . + + = = = Filming = = = + + Principal photography lasted about 50 days from December 14 , 1998 , to February 1999 . American Beauty was filmed on soundstages at the Warner Bros. backlot in Burbank , California , and at Hancock Park and Brentwood in Los Angeles . The aerial shots at the beginning and end of the film were captured in Sacramento , California , and many of the school scenes were shot at South High School in Torrance , California ; several extras in the gym crowd were South High students . The film is set in an upper middle @-@ class neighborhood in an unidentified American town . Production designer Naomi Shohan likened the locale to Evanston , Illinois , but said , " it 's not about a place , it 's about an archetype . [ ... ] The milieu was pretty much Anywhere , USA — upwardly mobile suburbia . " The intent was for the setting to reflect the characters , who are also archetypes . Shohan said , " All of them are very strained , and their lives are constructs . " The Burnhams ' household was designed as the reverse of the Fitts ' — the former a pristine ideal , but graceless and lacking in " inner balance " , leading to Carolyn 's desire to at least give it the appearance of a " perfect all @-@ American household " ; the Fitts ' home is depicted in " exaggerated darkness [ and ] symmetry " . + The production selected two adjacent properties on the Warner backlot 's " Blondie Street " for the Burnham and Fitts ' homes . 
The crew rebuilt the houses to incorporate false rooms that established lines of sight — between Ricky and Jane 's bedroom windows , and between Ricky 's bedroom and Lester 's garage . The garage windows were designed specifically to obtain the crucial shot toward the end of the film in which Col. Fitts — watching from Ricky 's bedroom — mistakenly assumes that Lester is paying Ricky for sex . Mendes made sure to establish the line of sight early on in the film to make the audience feel a sense of familiarity with the shot . The house interiors were filmed on the backlot , on location , and on soundstages when overhead shots were needed . The inside of the Burnhams ' home was shot at a house close to Interstate 405 and Sunset Boulevard in Los Angeles ; the inside of the Fitts ' home was shot in the city 's Hancock Park neighborhood . Ricky 's bedroom was designed to be cell @-@ like to suggest his " monkish " personality , while at the same time blending with the high @-@ tech equipment to reflect his voyeuristic side . The production deliberately minimized the use of red , as it was an important thematic signature elsewhere . The Burnhams ' home uses cool blues , while the Fitts ' is kept in a " depressed military palette " . + Mendes ' dominating visual style was deliberate and composed , with a minimalist design that provided " a sparse , almost surreal feeling — a bright , crisp , hard edged , near Magritte @-@ like take on American suburbia " ; Mendes constantly directed his set dressers to empty the frame . He made Lester 's fantasy scenes " more fluid and graceful " , and Mendes made minimal use of steadicams , feeling that stable shots generated more tension . For example , when Mendes used a slow push in to the Burnhams ' dinner table , he held the shot because his training as a theater director taught him the importance of putting distance between the characters . He wanted to keep the tension in the scene , so he only cut away when Jane left the table . 
Mendes did use a hand @-@ held camera for the scene in which Col. Fitts beats Ricky . Mendes said the camera provided the scene with a " kinetic [ ... ] off @-@ balance energy " . He also went hand @-@ held for the excerpts of Ricky 's camcorder footage . Mendes took a long time to get the quality of Ricky 's footage to the level he wanted . For the plastic @-@ bag footage , Mendes used wind machines to move the bag in the air . The scene took four takes ; two by the second unit did not satisfy Mendes , so he shot the scene himself . He felt his first take lacked grace , but for the last attempt , he changed the location to the front of a brick wall and added leaves on the ground . Mendes was satisfied by the way the wall gave definition to the outline of the bag . + Mendes avoided using close @-@ ups , as he believed the technique was overused ; he also cited Spielberg 's advice that he should imagine an audience silhouetted at the bottom of the camera monitor , to keep in mind that he was shooting for display on a 40 @-@ foot ( 10 m ) screen . Spielberg — who visited the set a few times — also advised Mendes not to worry about costs if he had a " great idea " toward the end of a long working day . Mendes said , " That happened three or four times , and they are all in the movie . " Despite Spielberg 's support , DreamWorks and Mendes fought constantly over the schedule and budget , although the studio interfered little with the film 's content . Spacey , Bening and Hall worked for significantly less than their usual rates . American Beauty cost DreamWorks $ 15 million to produce , slightly above their projected sum . Mendes was so dissatisfied with his first three days ' filming that he obtained permission from DreamWorks to reshoot the scenes . He said , " I started with a wrong scene , actually , a comedy scene . And the actors played it way too big : [ ... ] it was badly shot , my fault , badly composed , my fault , bad costumes , my fault [ ... 
] ; and everybody was doing what I was asking . It was all my fault . " Aware that he was a novice , Mendes drew on the experience of Hall : " I made a very conscious decision early on , if I didn 't understand something technically , to say , without embarrassment , ' I don 't understand what you 're talking about , please explain it . ' " + Mendes encouraged some improvisation ; for example , when Lester masturbates in bed beside Carolyn , the director asked Spacey to improvise several euphemisms for the act in each take . Mendes said , " I wanted that not just because it was funny [ ... ] but because I didn 't want it to seem rehearsed . I wanted it to seem like he was blurting it out of his mouth without thinking . [ Spacey ] is so in control — I wanted him to break through . " Spacey obliged , eventually coming up with 35 phrases , but Bening could not always keep a straight face , which meant the scene had to be shot 10 times . The production used small amounts of computer @-@ generated imagery . Most of the rose petals in Lester 's fantasies were added in post @-@ production , although some were real and had the wires holding them digitally removed . When Lester fantasizes about Angela in a rose @-@ petal bath , the steam was real , save for in the overhead shot . To position the camera , a hole had to be cut in the ceiling , through which the steam escaped ; it was instead added digitally . + + = = = Editing = = = + + American Beauty was edited by Christopher Greenbury and Tariq Anwar ; Greenbury began in the position , but had to leave halfway through post @-@ production because of a scheduling conflict with Me , Myself and Irene ( 2000 ) ( in which Chris Cooper also starred ) . Mendes and an assistant edited the film for 10 days between the appointments . Mendes realized during editing that the film was different from the one he had envisioned . He believed he had been making a " much more whimsical , [ ... 
] kaleidoscopic " film than what came together in the edit suite . Instead , Mendes was drawn to the emotion and darkness ; he began to use the score and shots he had intended to discard to craft the film along these lines . In total , he cut about 30 minutes from his original edit . The opening included a dream in which Lester imagines himself flying above the town . Mendes spent two days filming Spacey against bluescreen , but removed the sequence as he believed it to be too whimsical — " like a Coen brothers movie " — and therefore inappropriate for the tone he was trying to set . The opening in the final cut reused a scene from the middle of the film where Jane tells Ricky to kill her father . This scene was to be the revelation to the audience that the pair was not responsible for Lester 's death , as the way it was scored and acted made it clear that Jane 's request was not serious . However , in the portion he used in the opening — and when the full scene plays out later — Mendes used the score and a reaction shot of Ricky to leave a lingering ambiguity as to his guilt . The subsequent shot — an aerial view of the neighborhood — was originally intended as the plate shot for the bluescreen effects in the dream sequence . + Mendes spent more time recutting the first 10 minutes than the rest of the film taken together . He trialled several versions of the opening ; the first edit included bookend scenes in which Jane and Ricky are convicted of Lester 's murder , but Mendes excised these in the last week of editing because he felt they made the film lose its mystery , and because they did not fit with the theme of redemption that had emerged during production . Mendes believed the trial drew focus away from the characters and turned the film " into an episode of NYPD Blue " . Instead , he wanted the ending to be " a poetic mixture of dream and memory and narrative resolution " . 
When Ball first saw a completed edit , it was a version with truncated versions of these scenes . He felt that they were so short that they " didn 't really register " . Mendes and he argued , but Ball was more accepting after Mendes cut the sequences completely ; Ball felt that without the scenes , the film was more optimistic and had evolved into something that " for all its darkness had a really romantic heart " . + + = = = Cinematography = = = + + Conrad Hall was not the first choice for director of photography ; Mendes believed he was " too old and too experienced " to want the job , and he had been told that Hall was difficult to work with . Instead , Mendes asked Fred Elmes , who turned the job down because he did not like the script . Hall was recommended to Mendes by Tom Cruise , because of Hall 's work on Without Limits ( 1998 ) , which Cruise had executive produced . Mendes was directing Cruise 's then @-@ wife Nicole Kidman in the play The Blue Room during preproduction on American Beauty , and had already storyboarded the whole film . Hall was involved for one month during preproduction ; his ideas for lighting the film began with his first reading of the script , and further passes allowed him to refine his approach before meeting Mendes . Hall was initially concerned that audiences would not like the characters ; he only felt able to identify with them during cast rehearsals , which gave him fresh ideas on his approach to the visuals . + Hall 's approach was to create peaceful compositions that evoked classicism , to contrast with the turbulent on @-@ screen events and allow audiences to take in the action . Hall and Mendes first discussed the intended mood of a scene , but Hall was allowed to light the shot in any way he felt necessary . In most cases , Hall first lit the scene 's subject by " painting in " the blacks and whites , before adding fill light , which he reflected from beadboard or white card on the ceiling . 
This approach gave Hall more control over the shadows while keeping the fill light unobtrusive and the dark areas free of spill . Hall shot American Beauty in a 2 @.@ 39 : 1 aspect ratio in the Super 35 format , using Kodak Vision 500T 5279 35 mm film stock . He used Super 35 partly because its larger scope allowed him to capture elements such as the corners of the petal @-@ filled pool in its overhead shot , creating a frame around Angela within . He shot the whole film at the same T @-@ stop ( T1.9 ) ; given his preference for shooting that wide , Hall favored high @-@ speed stocks to allow for more subtle lighting effects . He used Panavision Platinum cameras with the company 's Primo series of prime and zoom lenses . Hall employed Kodak Vision 200T 5274 and EXR 5248 stock for scenes with daylight effects . He had difficulty adjusting to Kodak 's newly introduced Vision release print stock , which , combined with his contrast @-@ heavy lighting style , created a look with too much contrast . Hall contacted Kodak , who sent him a batch of 5279 that was 5 % lower in contrast . Hall used a 1 / 8 inch Tiffen Black ProMist filter for almost every scene , which he said in retrospect may not have been the best choice , as the optical steps required to blow Super 35 up for its anamorphic release print led to a slight amount of degradation ; therefore , the diffusion from the filter was not required . When he saw the film in a theater , Hall felt that the image was slightly unclear and that had he not used the filter , the diffusion from the Super 35 – anamorphic conversion would have generated an image closer to what he originally intended . + A shot where Lester and Ricky share a cannabis joint behind a building came from a misunderstanding between Hall and Mendes . 
Mendes asked Hall to prepare the shot in his absence ; Hall assumed the characters would look for privacy , so he placed them in a narrow passage between a truck and the building , intending to light from the top of the truck . When Mendes returned , he explained that the characters did not care if they were seen . He removed the truck and Hall had to rethink the lighting ; he lit it from the left , with a large light crossing the actors , and with a soft light behind the camera . Hall felt the consequent wide shot " worked perfectly for the tone of the scene " . Hall made sure to keep rain , or the suggestion of it , in every shot near the end of the film . In one shot during Lester 's encounter with Angela at the Burnhams ' home , Hall created rain effects on the foreground cross lights ; in another , he partly lit the pair through French windows to which he had added material to make the rain run slower , intensifying the light ( although the strength of the outside light was unrealistic for a night scene , Hall felt it justified because of the strong contrasts it produced ) . For the close @-@ ups when Lester and Angela move to the couch , Hall tried to keep rain in the frame , lighting through the window onto the ceiling behind Lester . He also used rain boxes to produce rain patterns where he wanted without lighting the entire room . + + = = = Music = = = + + Thomas Newman 's score was recorded in Santa Monica , California . He mainly used percussion instruments to create the mood and rhythm , the inspiration for which was provided by Mendes . Newman " favored pulse , rhythm , and color over melody " , making for a more minimalist score than he had previously created . He built each cue around " small , endlessly repeating phrases " — often , the only variety came through a " thinning of the texture for eight bars " . 
The percussion instruments included tablas , bongos , cymbals , piano , xylophones , and marimbas ; also featured were guitars , flute , and world music instruments . Newman also used electronic music and on " quirkier " tracks employed more unorthodox methods , such as tapping metal mixing bowls with a finger and using a detuned mandolin . Newman believed the score helped move the film along without disturbing the " moral ambiguity " of the script : " It was a real delicate balancing act in terms of what music worked to preserve [ that ] . " + The soundtrack features songs by Newman , Bobby Darin , The Who , Free , Eels , The Guess Who , Bill Withers , Betty Carter , Peggy Lee , The Folk Implosion , Gomez , and Bob Dylan , as well as two cover versions — The Beatles ' " Because " performed by Elliott Smith , and Neil Young 's " Don 't Let It Bring You Down " performed by Annie Lennox . Produced by the film 's music supervisor Chris Douridas , an abridged soundtrack album was released on October 5 , 1999 , and went on to be nominated for a Grammy Award for Best Soundtrack Album . An album featuring 19 tracks from Newman 's score was released on January 11 , 2000 , and won the Grammy Award for Best Score Soundtrack Album . Filmmaker considered the score to be one of Newman 's best , saying it " [ enabled ] the film 's transcendentalist aspirations " . In 2006 , the magazine chose the score as one of 20 essential soundtracks it believed spoke to the " complex and innovative relationships between music and screen storytelling " . + + = = Release = = + + + = = = Publicity = = = + + DreamWorks contracted Amazon.com to create the official website , marking the first time that Amazon had created a special section devoted to a feature film . The website included an overview , a photo gallery , cast and crew filmographies , and exclusive interviews with Spacey and Bening . 
The film 's tagline — " look closer " — originally came from a cutting pasted on Lester 's workplace cubicle by the set dresser . DreamWorks ran parallel marketing campaigns and trailers — one aimed at adults , the other at teenagers . Both trailers ended with the poster image of a girl holding a rose . Reviewing the posters of several 1999 films , David Hochman of Entertainment Weekly rated American Beauty 's highly , saying it evoked the tagline ; he said , " You return to the poster again and again , thinking , this time you 're gonna find something . " DreamWorks did not want to test screen the film ; according to Mendes , the studio was pleased with it , but he insisted on one where he could question the audience afterward . The studio reluctantly agreed and showed the film to a young audience in San Jose , California . Mendes claimed the screening went very well . + + = = = Theatrical run = = = + + The film had its world premiere on September 8 , 1999 , at Grauman 's Egyptian Theatre in Los Angeles . Three days later , the film appeared at the Toronto International Film Festival . With the filmmakers and cast in attendance , it screened at several American universities , including the University of California at Berkeley , New York University , the University of California at Los Angeles , the University of Texas at Austin , and Northwestern University . + On September 15 , 1999 , American Beauty opened to the public in limited release at three theaters in Los Angeles and three in New York . More theaters were added during the limited run , and on October 1 , the film officially entered wide release by screening in 706 theaters across North America . The film grossed $ 8 @,@ 188 @,@ 587 over the weekend , ranking third at the box office . Audiences polled by the market research firm CinemaScore gave American Beauty a " B + " grade on average . The theater count hit a high of 1 @,@ 528 at the end of the month , before a gradual decline . 
Following American Beauty 's wins at the 57th Golden Globe Awards , DreamWorks re @-@ expanded the theater presence from a low of 7 in mid @-@ February , to a high of 1 @,@ 990 in March . The film ended its North American theatrical run on June 4 , 2000 , having grossed $ 130 @.@ 1 million . + American Beauty had its European premiere at the London Film Festival on November 18 , 1999 ; in January 2000 , it began to screen in various territories outside North America . It debuted in Israel to " potent " returns , and limited releases in Germany , Italy , Austria , Switzerland , the Netherlands and Finland followed on January 21 . After January 28 opening weekends in Australia , the United Kingdom , Spain and Norway , American Beauty had earned $ 7 million in 12 countries for a total of $ 12 @.@ 1 million outside North America . On February 4 , American Beauty debuted in France and Belgium . Expanding to 303 theaters in the United Kingdom , the film ranked first at the box office with $ 1 @.@ 7 million . On the weekend of February 18 — following American Beauty 's eight nominations for the 72nd Academy Awards — the film grossed $ 11 @.@ 7 million from 21 territories , for a total of $ 65 @.@ 4 million outside North America . The film had " dazzling " debuts in Hungary , Denmark , the Czech Republic , Slovakia , and New Zealand . + As of February 18 , the most successful territories were the United Kingdom ( $ 15 @.@ 2 million ) , Italy ( $ 10 @.@ 8 million ) , Germany ( $ 10 @.@ 5 million ) , Australia ( $ 6 million ) , and France ( $ 5 @.@ 3 million ) . The Academy Award nominations meant strong performances continued across the board ; the following weekend , American Beauty grossed $ 10 @.@ 9 million in 27 countries , with strong debuts in Brazil , Mexico , and South Korea . Other high spots included robust returns in Argentina , Greece , and Turkey . 
On the weekend of March 3 , 2000 , American Beauty debuted strongly in Hong Kong , Taiwan , and Singapore , markets traditionally " not receptive to this kind of upscale fare " . The impressive South Korean performance continued , with a return of $ 1 @.@ 2 million after nine days . In total , American Beauty grossed $ 130 @.@ 1 million in North America and $ 226 @.@ 2 million internationally , for $ 356 @.@ 3 million worldwide . + + = = = Home media = = = + + American Beauty was released on VHS on May 9 , 2000 , and on DVD with the DTS format on October 24 , 2000 . Before the North American rental release on May 9 , Blockbuster Video wanted to purchase hundreds of thousands of extra copies for its " guaranteed title " range , whereby anyone who wanted to rent the film would be guaranteed a copy . Blockbuster and DreamWorks could not agree on a profit @-@ sharing deal , so Blockbuster ordered two @-@ thirds the number of copies it originally intended . DreamWorks made around one million copies available for rental ; Blockbuster 's share would usually have been about 400 @,@ 000 of these . Some Blockbuster stores only displayed 60 copies , and others did not display the film at all , forcing customers to ask for it . The strategy required staff to read a statement to customers explaining the situation ; Blockbuster claimed it was only " [ monitoring ] customer demand " due to the reduced availability . Blockbuster 's strategy leaked before May 9 , leading to a 30 % order increase from other retailers . In its first week of rental release , American Beauty made $ 6 @.@ 8 million . This return was lower than would have been expected had DreamWorks and Blockbuster reached an agreement . In the same year , The Sixth Sense made $ 22 million , while Fight Club made $ 8 @.@ 1 million , though the latter 's North American theatrical performance was just 29 % that of American Beauty . 
Blockbuster 's strategy also affected rental fees ; American Beauty averaged $ 3 @.@ 12 , compared with $ 3 @.@ 40 for films that Blockbuster fully promoted . Only 53 % of the film 's rentals were from large outlets in the first week , compared with the usual 65 % . + The DVD release included a behind @-@ the @-@ scenes featurette , film audio commentary from Mendes and Ball , and a storyboard presentation with discussion from Mendes and Hall . In the film commentary , Mendes refers to deleted scenes he intended to include in the release . However , these scenes are not on the DVD , as he changed his mind after recording the commentary ; Mendes felt that to show scenes he previously chose not to use would detract from the film 's integrity . + On September 21 , 2010 , Paramount Home Entertainment released American Beauty on Blu @-@ ray , as part of Paramount 's Sapphire Series . All the extras from the DVD release were present , with the theatrical trailers upgraded to HD . + + = = Critical reception = = + + American Beauty was widely considered the best film of 1999 by the American press . It received overwhelming praise , chiefly for Spacey , Mendes and Ball . Variety reported that " no other 1999 movie has benefited from such universal raves . " It was the best @-@ received title at the Toronto International Film Festival ( TIFF ) , where it won the People 's Choice award after a ballot of the festival 's audiences . TIFF 's director , Piers Handling , said , " American Beauty was the buzz of the festival , the film most talked @-@ about . " + Writing in Variety , Todd McCarthy said the cast ensemble " could not be better " ; he praised Spacey 's " handling of innuendo , subtle sarcasm , and blunt talk " and the way he imbued Lester with " genuine feeling " . 
Janet Maslin in The New York Times said Spacey was at his " wittiest and most agile " to date , and Roger Ebert of the Chicago Sun @-@ Times singled Spacey out for successfully portraying a man who " does reckless and foolish things [ but who ] doesn 't deceive himself " . Kevin Jackson of Sight & Sound said Spacey impressed in ways distinct from his previous performances , the most satisfying aspect being his portrayal of " both sap and hero " . Writing in Film Quarterly , Gary Hentzi praised the actors , but said that characters such as Carolyn and Col. Fitts were stereotypes . Hentzi accused Mendes and Ball of identifying too readily with Jane and Ricky , saying the latter was their " fantasy figure " — a teenaged boy who 's an absurdly wealthy artist able to " finance [ his ] own projects " . Hentzi said Angela was the most believable teenager , in particular with her " painfully familiar " attempts to " live up to an unworthy image of herself " . Maslin agreed that some characters were unoriginal , but said their detailed characterizations made them memorable . Kenneth Turan of the Los Angeles Times said the actors coped " faultlessly " with what were difficult roles ; he called Spacey 's performance " the energy that drives the film " , saying the actor commanded audience involvement despite Lester not always being sympathetic . " Against considerable odds , we do like [ these characters ] , " Turan concluded . + Maslin felt that Mendes directed with " terrific visual flair " , saying his minimalist style balanced " the mordant and bright " and that he evoked the " delicate , eroticized power @-@ playing vignettes " of his theater work . Jackson said Mendes ' theatrical roots rarely showed , and that the " most remarkable " aspect was that Spacey 's performance did not overshadow the film . He said that Mendes worked the script 's intricacies smoothly , to the ensemble 's strengths , and staged the tonal shifts skillfully . 
McCarthy believed American Beauty a " stunning card of introduction " for film débutantes Mendes and Ball . He said Mendes ' " sure hand " was " as precise and controlled " as his theater work . McCarthy cited Hall 's involvement as fortunate for Mendes , as the cinematographer was " unsurpassed " at conveying the themes of a work . Turan agreed that Mendes ' choice of collaborators was " shrewd " , naming Hall and Newman in particular . Turan suggested that American Beauty may have benefited from Mendes ' inexperience , as his " anything 's possible daring " made him attempt beats that more seasoned directors might have avoided . Turan felt that Mendes ' accomplishment was to " capture and enhance [ the ] duality " of Ball 's script — the simultaneously " caricatured [ ... ] and painfully real " characters . Hentzi , while critical of many of Mendes and Ball 's choices , admitted the film showed off their " considerable talents " . + Turan cited Ball 's lack of constraint when writing the film as the reason for its uniqueness , in particular the script 's subtle changes in tone . McCarthy said the script was " as fresh and distinctive " as any of its American film contemporaries , and praised how it analyzed the characters while not compromising narrative pace . He called Ball 's dialogue " tart " and said the characters — Carolyn excepted — were " deeply drawn " . One other flaw , McCarthy said , was the revelation of Col. Fitts ' homosexuality , which he said evoked " hoary Freudianism " . Jackson said the film transcended its clichéd setup to become a " wonderfully resourceful and sombre comedy " . He said that even when the film played for sitcom laughs , it did so with " unexpected nuance " . Hentzi criticized how the film made a mystery of Lester 's murder , believing it manipulative and simply a way of generating suspense . 
McCarthy cited the production and costume design as pluses , and said the soundtrack was good at creating " ironic counterpoint [ s ] " to the story . Hentzi concluded that American Beauty was " vital but uneven " ; he felt the film 's examination of " the ways which teenagers and adults imagine each other 's lives " was its best point , and that although Lester and Angela 's dynamic was familiar , its romantic irony stood beside " the most enduring literary treatments " of the theme , such as Lolita . Nevertheless , Hentzi believed that the film 's themes of materialism and conformity in American suburbia were " hackneyed " . McCarthy conceded that the setting was familiar , but said it merely provided the film with a " starting point " from which to tell its " subtle and acutely judged tale " . Maslin agreed ; she said that while it " takes aim at targets that are none too fresh " , and that the theme of nonconformity did not surprise , the film had its own " corrosive novelty " . Ebert awarded American Beauty four stars out of four , and Turan said it was layered , subversive , complex , and surprising , concluding it was " a hell of a picture " . + A few months after the film 's release , reports of a backlash appeared in the American press , and the years since have seen its critical regard wane . In 2005 , Premiere named American Beauty as one of 20 " most overrated movies of all time " ; Mendes accepted the inevitability of the critical reappraisal , saying , " I thought some of it was entirely justified — it was a little overpraised at the time . " + Currently , the film holds an 88 % score on Rotten Tomatoes based on 180 reviews , with an average rating of 8 @.@ 2 / 10 ; the critical consensus reads , " Flawlessly cast and brimming with dark , acid wit , American Beauty is a smart , provocative high point of late ' 90s mainstream Hollywood film . " Metacritic gives the film a score of 86 , based on 33 reviews , indicating " universal acclaim . 
" + + = = Accolades = = + + American Beauty was not considered an immediate favorite to dominate the American awards season . Several other contenders opened at the end of 1999 , and US critics spread their honors among them when compiling their end @-@ of @-@ year lists . The Chicago Film Critics Association and the Broadcast Film Critics Association named the film the best of 1999 , but while the New York Film Critics Circle , the National Society of Film Critics and the Los Angeles Film Critics Association recognized American Beauty , they gave their top awards to other films . By the end of the year , reports of a critical backlash suggested American Beauty was the underdog in the race for Best Picture ; however , at the Golden Globe Awards in January 2000 , American Beauty won Best Film , Best Director and Best Screenplay . + As the nominations for the 72nd Academy Awards approached , a frontrunner had not emerged . DreamWorks had launched a major campaign for American Beauty five weeks before ballots were due to be sent to the 5 @,@ 600 Academy Award voters . Its campaign combined traditional advertising and publicity with more focused strategies . Although direct mail campaigning was prohibited , DreamWorks reached voters by promoting the film in " casual , comfortable settings " in voters ' communities . The studio 's candidate for Best Picture the previous year , Saving Private Ryan , lost to Shakespeare in Love , so the studio took a new approach by hiring outsiders to provide input for the campaign . It hired three veteran consultants , who told the studio to " think small " . Nancy Willen encouraged DreamWorks to produce a special about the making of American Beauty , to set up displays of the film in the communities ' bookstores , and to arrange a question @-@ and @-@ answer session with Mendes for the British Academy of Film and Television Arts . 
Dale Olson advised the studio to advertise in free publications that circulated in Beverly Hills — home to many voters — in addition to major newspapers . Olson arranged to screen American Beauty to about 1 @,@ 000 members of the Actors Fund of America , as many participating actors were also voters . Bruce Feldman took Ball to the Santa Barbara International Film Festival , where Ball attended a private dinner in honor of Anthony Hopkins , meeting several voters who were in attendance . + In February 2000 , American Beauty was nominated for eight Academy Awards ; its closest rivals , The Cider House Rules and The Insider , received seven nominations each . In March 2000 , the major industry labor organizations all awarded their top honors to American Beauty ; perceptions had shifted — the film was now the favorite to dominate the Academy Awards . American Beauty 's closest rival for Best Picture was still The Cider House Rules , from Miramax . Both studios mounted aggressive campaigns ; DreamWorks bought 38 % more advertising space in Variety than Miramax . On March 26 , 2000 , American Beauty won five Academy Awards : Best Picture , Best Director , Best Actor ( Spacey ) , Best Original Screenplay and Best Cinematography . At the 53rd British Academy Film Awards , American Beauty won six of the 14 awards for which it was nominated : Best Film , Best Actor , Best Actress ( Bening ) , Best Cinematography , Best Film Music and Best Editing . In 2000 , the Publicists Guild of America recognized DreamWorks for the best film publicity campaign . In September 2008 , Empire named American Beauty the 96th " Greatest Movie of All Time " after a poll of 10 @,@ 000 readers , 150 filmmakers , and 50 film critics , the fourth @-@ highest ranked movie from 1999 ( behind Fight Club , The Matrix , and Magnolia ) . In 2013 , the Writers Guild of America ranked the screenplay number 38 on its list of 101 greatest screenplays . + The film was nominated for AFI 's 100 Years ... 
100 Movies ( 10th Anniversary Edition ) in 2007 . + + + = Christopher Gore = + + Christopher Gore ( September 21 , 1758 – March 1 , 1827 ) was a prominent Massachusetts lawyer , Federalist politician , and U.S. diplomat . Born into a family divided by the American Revolution , Gore sided with the victorious Patriots , established a successful law practice in Boston , and built a fortune by purchasing Revolutionary government debts at a discount and receiving full value for them from the government . + Gore entered politics in 1788 , serving briefly in the Massachusetts legislature before being appointed U.S. District Attorney for Massachusetts . He was then appointed by President George Washington to a diplomatic commission dealing with maritime claims in Great Britain . He returned to Massachusetts in 1804 and reentered state politics , running unsuccessfully for governor several times before winning in 1809 . He served one term , losing to Democratic @-@ Republican Elbridge Gerry in 1810 . He was appointed to the US Senate by Governor Caleb Strong in 1813 , where he led opposition to the War of 1812 . + Gore invested his fortune in a variety of businesses , including important infrastructure projects such as the Middlesex Canal and a bridge across the Charles River . He was a major investor in the early textile industry , funding the Boston Manufacturing Company and the Merrimack Manufacturing Company , whose business established the city of Lowell , Massachusetts . Gore was involved in a variety of charitable causes , and was a major benefactor of Harvard College , where the first library was named in his honor . His palatial mansion in Waltham , Massachusetts , now known as Gore Place , is one of the finest extant examples of Federalist architecture , and has been declared a National Historic Landmark . 
+ + = = Early years = = + + Christopher Gore was born in Boston on September 21 , 1758 , one of many children of Frances and John Gore , a successful merchant and artisan . He was the youngest of their three sons to survive to adulthood . He attended Boston Latin School , and entered Harvard College at the young ( even for the time ) age of thirteen . At the outset of the American Revolutionary War and the Siege of Boston in 1775 , Harvard 's buildings were occupied by the Continental Army , and Gore temporarily continued his studies in Bradford until Harvard could resume operations in Concord . While at Harvard Gore participated in a speaking club , and formed significant lifelong friendships with Rufus King and John Trumbull . + Gore graduated in 1776 , and promptly enlisted in the Continental artillery regiment of his brother @-@ in @-@ law Thomas Crafts , where he served as a clerk until 1778 . The Gore family was divided by the war : Gore 's father was a Loyalist who left Boston when the British Army evacuated the city in March 1776 . Gore was consequently called upon to support his mother and three sisters , who remained in Boston . In 1779 Gore successfully petitioned the state for the remaining family 's share of his father 's seized assets . + + = = Early legal career = = + + After his military service Gore studied law with John Lowell , and was admitted to the bar in 1778 after a comparatively brief tutelage . Gore 's law practice flourished , in part because many Loyalist lawyers had fled Massachusetts . Gore 's clients included Loyalists seeking to recover some of their assets , as well as London @-@ based British merchants with claims to pursue . His briefs were generally well @-@ reasoned , and he was seen as a successful trial lawyer . + Gore grew his fortune by investing carefully in revolutionary currency and bonds . 
The securities he purchased were paper that had been given to Continental Army soldiers in lieu of pay , which they often sold at a steep discount . One batch of securities he purchased , for instance , cost him about $ 3 @,@ 700 but had a face value of $ 25 @,@ 000 . In 1785 he married Rebecca Amory Payne , daughter of a wealthy merchant , maritime insurer , and director of the Bank of Massachusetts . The couple were known for their social graces and became prominent members of Boston society . + In 1786 Gore became concerned about a rise in anti @-@ lawyer sentiment in Massachusetts . Grievances over harsh policies pursued by Governor James Bowdoin blossomed into Shays ' Rebellion , which required militia action to crush in 1787 . Gore was one of several high @-@ profile lawyers assigned to defend participants in the rebellion ( included in this group were Theodore Sedgwick , Caleb Strong , James Sullivan , Levi Lincoln , Sr. , and Thomas Dawes ) . Although many rebels were ultimately convicted , a large number received amnesty . In 1788 , Gore was elected a delegate to the 1788 Massachusetts convention that ratified the United States Constitution . His election was contested because Boston , where he lived , was at the time more inclined toward state power . Gore nonetheless was strongly Federalist , urging support of the new Constitution . + + = = Legislator , banker , and speculator = = + + In 1788 Gore was elected to the Massachusetts House of Representatives . He took a leading role in adopting the state 's rules for actions required of it by the new federal constitution . By his proposal the legislature decided that presidential electors would be chosen by a joint session . He also proposed that the state House and Senate agree by separate votes on choices for the United States Senate , a process that would significantly reduce popular input to the choice . 
His choice was ultimately rejected in favor of a process whereby the House selected a slate of candidates , from which the Senate would choose one . In 1789 Gore decided to stand for reelection , but lost , owing to strong anti @-@ nationalist fervor in Boston at the time . He managed to win a seat later , when a special election was held after resignations opened several seats . + Gore 's financial speculations in the late 1780s significantly multiplied his wealth . In 1788 he and Andrew Craigie , a Boston businessman who had retained Gore for legal services , entered into a secret agreement to purchase Continental securities with a face value of $ 100 @,@ 000 in a speculative bid that their value would rise . By late October of that year , the pair had met this goal : Gore had purchased $ 90 @,@ 000 worth of paper for about $ 20 @,@ 000 , and encouraged Craigie to purchase more than the $ 11 @,@ 000 he had acquired if his funding would allow for it . Gore also purchased Massachusetts war @-@ related debts , and lobbied Massachusetts Congressmen for the U.S. government to assume those as well . + Gore 's windfall was realized when in 1790 the United States Congress , acting on a proposal made by Alexander Hamilton and supported by Gore 's friend Rufus King , passed legislation that exchanged Continental and state paper for new U.S. paper at face value . Not only did Gore win on this exchange , but the paper he received appreciated in value before he sold it . The exact amount he made is unclear from the surviving documents : John Quincy Adams wrote that Gore 's speculations made him the wealthiest lawyer in the country . + The success of Gore 's speculations prompted him to enter a partnership with Craigie , William Duer and Daniel Parker in an attempt to acquire U.S. foreign debt obligations on favorable terms . 
Parker was a business partner of Craigie 's , and Duer was an influential New York businessman and Treasury Department official whose lavish lifestyle impressed Gore . The partnership promoted sales of U.S. lands in Europe , and sought to acquire U.S. obligations to France . Although Gore sank $ 10 @,@ 000 into this venture , it failed : more powerful and experienced Dutch bankers outmaneuvered the Americans . Gore also engaged in other ventures with these partners , but apparently carefully stayed with financial speculations , and avoided the partners ' less successful land ventures . + Much of Gore 's financial activity was mediated through the Bank of Massachusetts , where his father @-@ in @-@ law was a director . Gore himself was elected to its board in 1785 , when he also became a shareholder . During his time on the board the bank tightened its regulations on loan repayments , a move that improved the stability of its capital . Gore used the bank for most of his personal deposits , but also drew on lines of credit for as much as several thousand dollars . The bank shares he held paid relatively high dividends until 1791 , when the bank received serious competition from the First Bank of the United States . + The Bank of the United States was established by Alexander Hamilton to provide stable banking services on a national scale , and sought to open a branch in Boston . Hamilton recruited heavily in the Bank of Massachusetts , and Gore decided to make the move . He sold his shares in the Massachusetts bank , and became a director of the Boston branch of the U.S. Bank . He also purchased 200 shares in the new bank , a relatively large investment . Gore was influential in making hiring decisions for the branch , and sought to merge state @-@ chartered banks into the organization , arguing that only a nationally chartered bank could provide consistent and stable service . Gore resigned from the board in 1794 , citing the demands of his law practice . 
+ Gore 's financial successes enabled him to join the elite society of Boston . In 1789 he purchased a large mansion on fashionable Bowdoin Square , and also bought a country estate in Waltham that grew over time to 300 acres ( 120 ha ) . He had a house built on the estate , most of which he operated as a gentleman farmer . He and other similarly @-@ situated Federalists formed the Massachusetts Society for Promoting Agriculture , of which he served as a trustee for several years ; the organization was not seen as significantly contributing to advances in agriculture . + + = = District attorney and diplomat = = + + In 1789 President George Washington appointed Gore the first United States Attorney for Massachusetts as a reward for his support . Gore controversially refused to resign from the state legislature , arguing that the state constitution 's prohibitions against holding multiple offices did not apply to federal posts . He eventually resigned the legislative seat under protest because of pressure from his fellow legislators . + Gore served as district attorney until 1796 . His principal matter of concern was the enforcement of U.S. neutrality with respect to the French Revolutionary Wars . He attempted several times to prosecute the French consul in Boston , Antoine Duplaine , for arming and operating privateers out of the Port of Boston , but he was stymied by local juries that sympathized with the French . Duplaine was eventually expelled on orders from President George Washington based on evidence provided by Gore . + Gore also promoted anti @-@ French sentiment with political writings in Massachusetts newspapers . Writing under the pseudonym " Manlius " , he denounced the formation of " Democratic Societies " formed to oppose Federalist policy and support pro @-@ French positions . He suggested to President Washington that someone be sent to England to negotiate with the British . 
 John Jay traveled to London in 1794 and negotiated the Jay Treaty , whose ratification Gore vocally supported . Although Gore was hostile to French policy , he was on friendly terms with individual Frenchmen : he hosted the future French statesman Talleyrand when he visited the U.S. + In 1796 Washington appointed him as a commissioner representing the United States to handle maritime claims under the terms of the Jay Treaty . As a result the Gores moved to England that year , establishing a residence in the fashionable Hyde Park area . The commission was established to arbitrate claims emanating from British seizures of American vessels and cargoes , and from British claims relating to violations of American neutrality in the ongoing French Revolutionary Wars . It consisted of three Americans ( Gore , William Pinkney , and John Trumbull ) and two British commissioners ( John Nicoll and Nicholas Astley ) ; Trumbull was chosen by the other four because he was deemed to be sufficiently " fair @-@ minded " to cast deciding votes in the event of disagreements . That year Gore was also elected a Fellow of the American Academy of Arts and Sciences . + Although Gore was well received by the British establishment , the work suffered from what Gore called a " tediousness of process " , and he considered requesting a transfer in 1798 . In 1800 it ground to a halt because another board established by the treaty to resolve outstanding Revolutionary War claims against the United States had not yet met , and the British stopped the claims processing until resolution of the other issues got underway . Gore used this break to briefly return to America and assess the condition of his Waltham estate , where the house had been largely destroyed by fire in 1799 . After his return to London , with the commission work still stopped , he and Rebecca embarked on a tour of Europe . They visited Holland , Belgium , and Switzerland , and spent six months in Paris .
 During this trip , and later ones in England and Scotland , they took note of the architecture of country estates , and began planning a new house for their Waltham property . + The commission resumed its work in early 1802 , and had resolved all outstanding claims by August 1803 . It awarded $ 110 @,@ 000 to British claimants and over $ 6 million to American claimants . The lopsided result is due to the vastly larger number of American claims , but also to some key early decisions that favored American interpretations in the processing of the claims , and to a British administration that sought to remain in America 's good graces . + The Gores ' social circle in England revolved around Gore 's good friend Rufus King , who was appointed Ambassador to Great Britain in 1796 , along with other Massachusetts expatriates . When King left his post in May 1803 he named Gore to head the London embassy as chargé d 'affaires . Although President Thomas Jefferson never issued a formal appointment , the British government accepted his role for the two @-@ month interval between King 's departure and the arrival of James Monroe as King 's replacement . The Gores sailed for Boston in the spring of 1804 . + Rebecca Gore used their exposure to European country estates to design a lavish new building for their Waltham estate during their English sojourn . Designed with the assistance of French architect Joseph @-@ Guillaume Legrand and probably also influenced by the works of English architect Sir John Soane , the house that was built upon their return to the United States in 1804 ( now known as Gore Place ) is one of the finest extant examples of Federalist architecture . + + = = Lawyer and state legislator = = + + Soon after his return to the United States , Gore reentered state politics , winning election to the Massachusetts Senate . He was active in the state Federalist Party organization , sitting on its secret central committee .
 He resumed his law practice , in which he took on as a student Daniel Webster . One of the highest profile cases he took on was the 1807 defense of Thomas Selfridge , accused of murdering Charles Austin . Selfridge , an older Federalist attorney , had been retained to assist in the collection of a debt from Austin 's Republican father . In the politically charged atmosphere of the day in Boston , Selfridge , fearing for his own safety , had armed himself with a dueling pistol . The younger Austin had , apparently on his own initiative , sought to beat Selfridge with a cane , and Selfridge fatally shot him in the encounter . Selfridge was prosecuted by Attorney General ( and future Gore gubernatorial opponent ) James Sullivan , and the defense also included arch @-@ Federalist Harrison Gray Otis . Gore argued Selfridge acted in self @-@ defense ; Selfridge was acquitted of murder by a jury whose foreman was Patriot and Federalist Paul Revere after fifteen minutes ' deliberation . + Gore also resumed business activities upon his return . He invested in a wide variety of businesses and infrastructure , spurring economic activity in the state . His investments ranged widely , including maritime insurance ( where his father @-@ in @-@ law had made his fortune ) , bridges , locks , canals , and textiles . He was a major investor in the Middlesex Canal , the Craigie Bridge ( the first to connect Boston to Cambridge ) , and the Boston Manufacturing Company , whose factory proving the single @-@ site production of textiles was in Waltham near his estate . Not all of his ventures panned out : the canal was in the long run a financial failure , as were efforts with other collaborators to develop Lechmere Point , the Cambridge side of the Craigie Bridge . The textile mill , however , was a success , and Gore invested in the Merrimack Manufacturing Company .
When it decided to locate in what is now Lowell , Massachusetts , Gore purchased shares in the Proprietors of Locks and Canals , which operated ( and still owns today ) the Lowell canals . + In 1806 Gore won election to the State Senate . That year the Republicans were in the majority , and the election for governor was close enough to require a recount . The legislature scrutinized the ballots in a partisan manner ( for example , retaining ballots containing misspelled versions of Republican James Sullivan 's name and discarding similar ballots marked for Federalist Caleb Strong ) . Gore and other Federalists raised a public outcry , and the legislature relented , eventually certifying Strong as the winner . + Gore ran unsuccessfully for Governor of Massachusetts in 1807 and 1808 against a rising tide of Republicanism in the state , losing both times to moderate Republican James Sullivan . The Federalists gained control of the state legislature in 1808 in a backlash against Republican economic policies , but Gore was criticized for his failure to aggressively support state protests against the Embargo Act of 1807 , which had a major negative effect on the state 's large merchant fleet . Gore was in 1808 elected to the Massachusetts House of Representatives , where he successfully led Federalist efforts to ensure the selection of a Federalist slate of presidential electors . He also spearheaded actions to drive Senator John Quincy Adams from the Federalist Party over his support of Thomas Jefferson 's foreign policy . The legislature elected Adams ' successor nine months early , and gave Adams sufficiently distasteful instructions that he resigned the post and joined with the Republicans . + + = = Governor = = + + Gore led the Federalists to victory in 1809 against Sullivan 's successor , Levi Lincoln , Sr. , who had taken over as acting governor upon Sullivan 's death late in 1808 . 
During Gore 's term the principal domestic issue occupying state politics was a banking crisis stimulated by the federal policy of embargoing trade with Great Britain and France , then embroiled in the Napoleonic Wars . Although the crisis caused a number of bank failures in New England , Massachusetts banks largely escaped unscathed . + Foreign policy played a major role in Gore 's administration . The legislature passed resolves opposing the federal government 's hardline policy against trade and diplomatic relations with the United Kingdom ( then embroiled in the Napoleonic Wars ) , and Gore in early 1810 invited Francis James Jackson , who had been rejected as the UK 's ambassador to the US , to visit the state . This pressure may have played a role in President James Madison 's decision to renew relations with the UK and accept Jackson 's credentials . + The lessening of the war threat , and the choice by the Republicans of the popular Elbridge Gerry as their candidate brought a challenge to Federalist control of Massachusetts in the 1810 elections . The unostentatious Gerry and Republican partisans criticized Gore for his lavish lifestyle , including his palatial Waltham residence and pompous activities he organized as governor , and highlighted his Loyalist family connections while emphasizing Gerry 's unimpeachable patriotism . Gerry won the election . Jackson did visit Boston , but he was greeted not by Gore , but Gerry . Gore ran against Gerry again in 1811 , but lost in another acrimonious campaign . + Gore was granted an honorary law degree from Harvard in 1809 . He served on the college 's Board of Overseers from 1810 to 1815 and as a Fellow from 1816 to 1820 . Harvard 's first library building , a Gothic structure built in 1838 of Quincy granite , was named in his honor , but was demolished when Widener Library was built in its place in 1915 . ( This structure is found on the seal of the city of Cambridge . 
) One of the residential Winthrop House 's buildings is called Gore Hall in his honor . + + = = United States Senator = = + + In the spring of 1813 , he was appointed by Governor Caleb Strong to fill the U.S. Senate seat vacated by the resignation of Senator James Lloyd . He served from May 5 , 1813 to May 30 , 1816 , winning reelection to the seat in 1814 . He opposed the ongoing War of 1812 in these years , with his earlier diplomatic experience providing valuable knowledge to Federalist interests . He expressed approval of the 1814 Hartford Convention in which the New England states aired grievances concerning Republican governance of the country and the conduct of the war . + Gore assented to the Treaty of Ghent that ended the war , but was unhappy that the nation had not gained anything from the war . He resigned in June 1816 , unhappy with the politics of Washington and suffering from poor health . Although he was no longer active in politics , he continued to express opinions on the subjects of the day , opposing the 1820 Missouri Compromise and bemoaning the " great moderation & mediocrity " of Federalist Governor John Brooks . + + = = Later years and legacy = = + + Gore remained active in the administration of Harvard , and was active in a number of organizations , including the American Academy of Arts and Sciences and the Massachusetts Historical Society ( whose president he was from 1806 to 1818 ) . He was also elected a member of the American Antiquarian Society in 1814 . Gore spent most of his later years at his country estate in Waltham , suffering from worsening rheumatoid arthritis that made walking increasingly difficult . His declining health and lack of social scene in Waltham led him in 1822 to return to Boston in the winters . He died on March 1 , 1827 in Boston and is buried in its Granary Burying Ground . + Gore 's wife died in 1834 ; the couple had no children . 
The major beneficiary of the Gore estate was Harvard ( which received an estimated $ 100 @,@ 000 ) , although bequests were also made to the American Academy of Arts and Sciences and the Massachusetts Historical Society . The Waltham estate passed through several hands and was subdivided over time . The mansion was saved from demolition by the Gore Place Society ( established for the purpose of preserving it ) , which now operates it as a museum . It was declared a National Historic Landmark in 1970 . + + + = Nero = + + Nero ( / ˈnɪəroʊ / ; Latin : Nerō Claudius Caesar Augustus Germanicus ; 15 December 37 AD – 9 June 68 AD ) was Roman Emperor from 54 to 68 , and the last in the Julio @-@ Claudian dynasty . Nero was adopted by his grand @-@ uncle Claudius to become his heir and successor , and succeeded to the throne in 54 following Claudius ' death . + Nero focused much of his attention on diplomacy , trade and enhancing the cultural life of the empire , but according to the historian Tacitus he was viewed by the Roman people as compulsive and corrupt . He ordered theatres built and promoted athletic games . During his reign , the redoubtable general Corbulo conducted a successful war and negotiated peace with the Parthian Empire . His general Suetonius Paulinus crushed a revolt in Britain . Nero annexed the Bosporan Kingdom to the empire and began the First Jewish – Roman War . + In 64 AD , most of Rome was destroyed in the Great Fire of Rome , which many Romans believed Nero himself had started in order to clear land for his planned palatial complex , the Domus Aurea . In 68 , the rebellion of Vindex in Gaul and later the acclamation of Galba in Hispania drove Nero from the throne . Facing a false report of being denounced as a public enemy who was to be executed , he committed suicide on 9 June 68 ( the first Roman emperor to do so ) . His death ended the Julio @-@ Claudian dynasty , sparking a brief period of civil wars known as the Year of the Four Emperors . 
Nero 's rule is often associated with tyranny and extravagance . He is known for many executions , including that of his mother , and the probable murder by poison of his stepbrother Britannicus . + Nero was rumored to have had captured Christians dipped in oil and set on fire in his garden at night as a source of light . This view is based on the writings of Tacitus , Suetonius and Cassius Dio , the main surviving sources for Nero 's reign , but a few surviving sources paint Nero in a more favourable light . Some sources , including some mentioned above , portray him as an emperor who was popular with the common Roman people , especially in the East . Some modern historians question the reliability of ancient sources when reporting on Nero 's tyrannical acts . + + = = Early life = = + + + = = = Family = = = + + Lucius Domitius Ahenobarbus , Nero , was born on 15 December 37 in Antium ( modern Anzio and Nettuno ) , near Rome . He was the only son of Gnaeus Domitius Ahenobarbus and Agrippina the Younger , sister of Emperor Caligula . + Nero 's father , Gnaeus , was the son of Lucius Domitius Ahenobarbus ( consul 16 BC ) and Antonia Major . Gnaeus was thus the grandson of Gnaeus Domitius Ahenobarbus ( consul 32 BC ) and probably Aemilia Lepida on his father 's side , and the grandson of Mark Antony and Octavia Minor on his mother 's side . Thus , Nero had as his paternal grandmother Antonia Major , and also claimed more remote descent from Antonia Minor as a great @-@ grandson — later grandson after Claudius adopted him . + Through Octavia , Nero was the great @-@ nephew of Caesar Augustus . Nero 's father had been employed as a praetor and was a member of Caligula 's staff when the latter travelled to the East ( some apparently think Suetonius refers to Augustus 's adopted son Gaius Caesar here , but this is not likely ) . + Nero 's father was described by Suetonius as a murderer and a cheat who was charged by Emperor Tiberius with treason , adultery and incest . 
 Tiberius died , allowing him to escape these charges . Nero 's father died of edema ( " dropsy " ) in 39 when Nero was two . + Nero 's mother was Agrippina the Younger , a great @-@ granddaughter of Caesar Augustus and his wife Scribonia through their daughter Julia the Elder and her husband Marcus Vipsanius Agrippa . Agrippina 's father , Germanicus , was a grandson of Augustus 's wife , Livia , on one side and of Mark Antony and Octavia on the other . Germanicus ' mother , Antonia Minor , was a daughter of Octavia Minor and Mark Antony . Octavia was Augustus ' elder sister . Germanicus was also the adopted son of Tiberius . Agrippina poisoned her second husband Passienus Crispus , so many ancient historians also accuse her of murdering her third husband , the emperor Claudius . + + = = = Ancestry and family = = = + + + = = = Rise to power = = = + + Nero was not expected to become Emperor because his maternal uncle , Caligula , had begun his reign at the age of 24 with enough time to produce his own heir . Nero 's mother , Agrippina , lost favour with Caligula and was exiled in 39 after her husband 's death . Caligula seized Nero 's inheritance and sent him to be brought up by his less wealthy aunt , Domitia Lepida , who was the mother of Valeria Messalina , Claudius 's third wife . Caligula , his wife Caesonia and their infant daughter Julia Drusilla were murdered on 24 January 41 . These events led Claudius , Caligula 's uncle , to become emperor . Claudius allowed Agrippina to return from exile . + Claudius had married twice before marrying Valeria Messalina . His previous marriages produced three children including a son , Drusus , who died at a young age . He had two children with Messalina – Claudia Octavia ( born 40 ) and Britannicus ( born 41 ) . Messalina was executed by Claudius in the year 48 . + In 49 AD , Claudius married a fourth time , to Nero 's mother Agrippina , despite her being his niece .
To aid Claudius politically , young Nero was adopted in 50 and took the name Nero Claudius Caesar Drusus Germanicus ( see adoption in Rome ) . Nero was older than his stepbrother Britannicus , and thus became heir to the throne . + Nero was proclaimed an adult in 51 at the age of 14 . He was appointed proconsul , entered and first addressed the Senate , made joint public appearances with Claudius , and was featured in coinage . In 53 , he married his stepsister Claudia Octavia . + + = = Emperor ( 54 – 68 AD ) = = + + + = = = Early rule = = = + + Claudius died in 54 and Nero , taking the name Nero Claudius Caesar Augustus Germanicus , was established as Emperor . Though accounts vary , many ancient historians state Agrippina poisoned Claudius . According to Pliny the Elder , she used poison mushrooms . It is not known how much Nero knew or if he was even involved in the death of Claudius . + Suetonius wrote " ... for even if he was not the instigator of the emperor 's death , he was at least privy to it , as he openly admitted ; for he used afterwards to laud mushrooms , the vehicle in which the poison was administered to Claudius , as " the food of the gods , " as the Greek proverb has it . At any rate , after Claudius ' death he vented on him every kind of insult , in act and word , charging him now with folly and now with cruelty ; for it was a favourite joke of his to say that Claudius had ceased " to play the fool " among mortals , lengthening the first syllable of the word morari , and he disregarded many of his decrees and acts as the work of a madman and a dotard . Finally , he neglected to enclose the place where his body was burned except with a low and mean wall . 
 " + According to Suetonius , Nero became Emperor at the age of 17 when the news of Claudius ' death was made known , making him the youngest emperor at that time . However , what Suetonius may have meant is that he was in his seventeenth year , as his date of birth , also listed by Suetonius , would have made him 16 at the time of Claudius ' death . Tacitus , in book XIII of his Annals , describes Nero as being ' scarcely out of his boyhood ' at the time he became emperor . Ancient historians describe Nero 's early reign as being strongly influenced by his mother , Agrippina , his tutor Lucius Annaeus Seneca , and the Praetorian Prefect Sextus Afranius Burrus , especially in the first year . Other tutors were less often mentioned , such as Alexander of Aegae . + Very early in Nero 's rule , problems arose from competition for influence between Agrippina and Nero 's two main advisers , Seneca and Burrus . Agrippina also attempted to influence the young Nero . Agrippina also is mentioned by ancient sources as " scheming for her son ( Nero ) " . This scheming continued , as is evidenced by the coin depicting both of them . It is extremely unusual to see a woman 's face on a coin in the ancient world . It is because of this position of power that Agrippina felt jealous as Seneca in particular rose up in Nero 's court , as he , unlike his mother , offered the advice Nero wanted to hear . + In 54 , Agrippina tried to sit down next to Nero while he met with an Armenian envoy , but Seneca stopped her and prevented a scandalous scene ( as it was unimaginable at that time for a woman to be in the same room as men doing official business ) . Nero 's friends also mistrusted Agrippina and told Nero to beware of his mother . + Nero was reportedly unsatisfied with his marriage to Octavia and entered into an affair with Claudia Acte , a former slave . In 55 , Agrippina attempted to intervene in favor of Octavia and demanded that her son dismiss Acte .
Nero , with the support of Seneca , resisted the intervention of his mother in his personal affairs . + With Agrippina 's influence over her son severed , she reportedly began pushing for Britannicus , Nero 's stepbrother , to become emperor . Nearly fourteen @-@ year @-@ old Britannicus , heir @-@ designate prior to Nero 's adoption , was still legally a minor , but was approaching legal adulthood . According to Tacitus , Agrippina hoped that with her support , Britannicus , being the blood son of Claudius , would be seen as the true heir to the throne by the state over Nero . However , the youth died suddenly and suspiciously on 12 February 55 , the very day before his proclamation as an adult had been set . + Nero claimed that Britannicus died from an epileptic seizure , but ancient historians all claim Britannicus ' death came from Nero 's poisoning him . Supposedly , he enlisted the services of Locusta , a woman who specialized in the manufacture of poisons . She devised a mixture to kill Britannicus , but after testing it unsuccessfully on a slave , Nero angrily threatened to have her put to death if she did not come up with something usable . Locusta then devised a new concoction that she promised would " kill swifter than a viper . " + Her promise was fulfilled after Britannicus consumed it at a dinner party from water used to cool his wine , which had already been tasted , and succumbed within minutes . After the death of Britannicus , Agrippina was accused of slandering Octavia and Nero ordered her out of the imperial residence . + + = = = Matricide and consolidation of power = = = + + Over time , Nero became progressively more powerful , freeing himself of his advisers and eliminating rivals to the throne . In 55 , he removed Marcus Antonius Pallas , an ally of Agrippina , from his position in the treasury . Pallas , along with Burrus , was accused of conspiring against the Emperor to bring Faustus Sulla to the throne . 
Seneca was accused of having relations with Agrippina and embezzlement . Seneca succeeded in having himself , Pallas and Burrus acquitted . According to Cassius Dio , at this time , Seneca and Burrus reduced their role in governing from careful management to mere moderation of Nero . + In 58 , Nero became romantically involved with Poppaea Sabina , the wife of his friend and future emperor Otho . Reportedly because a marriage to Poppaea and a divorce from Octavia did not seem politically feasible with Agrippina alive , Nero ordered the murder of his mother in 59 . A number of modern historians find this an unlikely motive as Nero did not marry Poppaea until 62 . + Additionally , according to Suetonius , Poppaea did not divorce her husband until after Agrippina 's death , making it unlikely that the already married Poppaea would be pressing Nero for marriage . Some modern historians theorize that Nero 's execution of Agrippina was prompted by her plotting to set Rubellius Plautus on the throne . According to Suetonius , Nero tried to kill his mother through a shipwreck planned by his freedman tutor Anicetus . Instead , it took the life of Agrippina 's friend , Acerronia Polla . When Agrippina survived , he had her executed by Anicetus and framed it as a suicide . The incident is also recorded by Tacitus . + In 62 , Nero 's adviser , Burrus , died . Additionally , Seneca was again faced with embezzlement charges . Seneca asked Nero for permission to retire from public affairs . Nero divorced and banished Octavia on grounds of infertility , leaving him free to marry the pregnant Poppaea . After public protests , Nero was forced to allow Octavia to return from exile , but she was executed shortly after her return . + Nero also was reported to have kicked Poppaea to death in 65 before she could have his second child . 
 However , modern historians , noting Suetonius , Tacitus and Cassius Dio 's possible bias against Nero and the likelihood that they did not have eyewitness accounts of private events , postulate that Poppaea may have died because of complications of miscarriage or childbirth . + Accusations of treason being plotted against Nero and the Senate first appeared in 62 . The Senate ruled that Antistius , a praetor , should be put to death for speaking ill of Nero at a party . Later , Nero ordered the exile of Fabricius Veiento who slandered the Senate in a book . Tacitus writes that the roots of the conspiracy led by Gaius Calpurnius Piso began in this year . To consolidate power , Nero executed a number of people in 62 and 63 including his rivals Pallas , Rubellius Plautus and Faustus Sulla . According to Suetonius , Nero " showed neither discrimination nor moderation in putting to death whomsoever he pleased " during this period . + Nero 's consolidation of power also included a slow usurping of authority from the Senate . In 54 , Nero promised to give the Senate powers equivalent to those under Republican rule . By 65 , senators complained that they had no power left and this led to the Pisonian conspiracy . + + = = = Other relationships = = = + + When Nero 's wife Poppaea Sabina died in 65 , Nero went into deep mourning . Her body was not cremated ; instead , it was stuffed with spices , embalmed and put in the Mausoleum of Augustus . She was given a state funeral . Nero praised her during the funeral eulogy and gave her divine honors . It is said that Nero burned ten years ' worth of Arabia 's incense production at her funeral . + In the beginning of 66 , he married Statilia Messalina . She was already married when she became Nero 's mistress in 65 AD , with Statilia 's husband being driven to suicide in 66 , so Nero could marry Statilia . She was one of the few of Nero 's courtiers who survived the fall of his reign .
+ In 67 , Nero ordered a young freedman , Sporus , to be castrated and then married him . According to Dion Cassius , Sporus bore an uncanny resemblance to Sabina , and Nero even called him by his dead wife 's name . + + = = = Administrative policies = = = + + Over the course of his reign , Nero often made rulings that pleased the lower class . Nero was criticized as being obsessed with personal popularity . + Nero began his reign in 54 by promising the Senate more autonomy . In this first year , he forbade others to refer to him with regard to enactments , for which he was praised by the Senate . Nero was known for spending his time visiting brothels and taverns during this period . + In 55 , Nero began taking on a more active role as an administrator . He was consul four times between 55 and 60 . During this period , some ancient historians speak fairly well of Nero and contrast it with his later rule . + Under Nero , restrictions were put on the amount of bail and fines . Also , fees for lawyers were limited . There was a discussion in the Senate on the misconduct of the freedmen class , and a strong demand was made that patrons should have the right of revoking freedom . Nero supported the freedmen and ruled that patrons had no such right . + The Senate tried to pass a law in which the crimes of one slave applied to all slaves within a household . Despite riots from the people , Nero supported the Senate on their measure , and deployed troops to organise the execution of 400 slaves affected by the law . However , he vetoed strong measures against the freedmen affected by the case . + After tax collectors were accused of being too harsh to the poor , Nero transferred collection authority to lower commissioners . Nero banned any magistrate or procurator from exhibiting public entertainment for fear that the venue was being used as a method to sway the populace . 
Additionally , there were many impeachments and removals of government officials along with arrests for extortion and corruption . + When further complaints arose that the poor were being overly taxed , Nero attempted to repeal all indirect taxes . The Senate convinced him this action would bankrupt the public treasury . As a compromise , taxes were cut from 4 @.@ 5 % to 2 @.@ 5 % . Additionally , secret government tax records were ordered to become public . To lower the cost of food imports , merchant ships were declared tax @-@ exempt . + In imitation of the Greeks , Nero built a number of gymnasiums and theatres . Enormous gladiatorial shows were also held . Nero also established the quinquennial Neronia . The festival included games , poetry , and theater . Historians indicate that there was a belief that theatre led to immorality . Others considered that to have performers dressed in Greek clothing was old fashioned . Some questioned the large public expenditure on entertainment . + In 64 , Rome burned . Nero enacted a public relief effort as well as significant reconstruction . A number of other major construction projects occurred in Nero 's late reign . Nero had the marshes of Ostia filled with rubble from the fire . He erected the large Domus Aurea . In 67 , Nero attempted to have a canal dug at the Isthmus of Corinth . Ancient historians state that these projects and others exacerbated the drain on the State 's budget . + The cost to rebuild Rome was immense , requiring funds the state treasury did not have . Nero devalued the Roman currency for the first time in the Empire 's history . He reduced the weight of the denarius from 84 per Roman pound to 96 ( 3 @.@ 85 grams to 3 @.@ 35 grams ) . He also reduced the silver purity from 99 @.@ 5 % to 93 @.@ 5 % — the silver weight dropping from 3 @.@ 83 grams to 3 @.@ 4 grams . Furthermore , Nero reduced the weight of the aureus from 40 per Roman pound to 45 ( 8 grams to 7 @.@ 2 grams ) . 
+ Between 62 and 67 , according to Plinius the Elder and Seneca , Nero promoted an expedition to discover the sources of the Nile River . It was the first exploration of equatorial Africa from Europe in history . However , Nero 's expedition up the Nile failed upon reaching the impenetrable Sudd of present @-@ day South Sudan . + The economic policy of Nero is a point of debate among scholars . According to ancient historians , Nero 's construction projects were overly extravagant and the large number of expenditures under Nero left Italy " thoroughly exhausted by contributions of money " with " the provinces ruined . " Modern historians , though , note that the period was riddled with deflation and that it is likely that Nero 's spending came in the form of public works projects and charity intended to ease economic troubles . + + = = = Great Fire of Rome ( 64 AD ) = = = + + The Great Fire of Rome erupted on the night of 18 July to 19 July 64 . The fire started at the southeastern end of the Circus Maximus in shops selling flammable goods . + The extent of the fire is uncertain . According to Tacitus , who was nine at the time of the fire , it spread quickly and burned for over five days . It destroyed three of fourteen Roman districts and severely damaged seven . The only other historian who lived through the period and mentioned the fire is Pliny the Elder , who wrote about it in passing . Other historians who lived through the period ( including Josephus , Dio Chrysostom , Plutarch and Epictetus ) make no mention of it in what remains of their work . + It is uncertain who or what actually caused the fire — whether accident or arson . Suetonius and Cassius Dio favor Nero as the arsonist , so he could build a palatial complex . Tacitus mentions that Christians confessed to the crime , but it is not known whether these confessions were induced by torture . However , accidental fires were common in ancient Rome . 
In fact , Rome suffered other large fires in 69 and in 80 . + It was said by Suetonius and Cassius Dio that Nero sang the " Sack of Ilium " in stage costume while the city burned . Popular legend claims that Nero played the fiddle at the time of the fire , an anachronism based merely on the concept of the lyre , a stringed instrument associated with Nero and his performances . ( The fiddle was not invented until the 10th century . ) Tacitus 's account , however , has Nero in Antium at the time of the fire . Tacitus also said that Nero playing his lyre and singing while the city burned was only rumor . + According to Tacitus , upon hearing news of the fire , Nero returned to Rome to organize a relief effort , which he paid for from his own funds . Nero 's contributions to the relief extended to personally taking part in the search for and rescue of victims of the blaze , spending days searching the debris without even his bodyguards . After the fire , Nero opened his palaces to provide shelter for the homeless , and arranged for food supplies to be delivered in order to prevent starvation among the survivors . + In the wake of the fire , he made a new urban development plan . Houses after the fire were spaced out , built in brick , and faced by porticos on wide roads . Nero also built a new palace complex known as the Domus Aurea in an area cleared by the fire . This included lush artificial landscapes and a 30 @-@ meter @-@ tall statue of himself , the Colossus of Nero . The size of this complex is debated ( from 100 to 300 acres ) . To find the necessary funds for the reconstruction , tributes were imposed on the provinces of the empire . + Tacitus , in one of the earliest non @-@ Christian references to the origins of Christianity , notes that the population searched for a scapegoat and rumors held Nero responsible . To deflect blame , Nero targeted Christians . He ordered Christians to be thrown to dogs , while others were crucified and burned . 
+ + = = = Public performances = = = + + Nero enjoyed driving a one @-@ horse chariot , singing to the lyre and poetry . He even composed songs that were performed by other entertainers throughout the empire . At first , Nero only performed for a private audience . + In 64 AD , Nero began singing in public in Neapolis in order to improve his popularity . He also sang at the second quinquennial Neronia in 65 . It was said that Nero craved the attention , but historians also write that Nero was encouraged to sing and perform in public by the Senate , his inner circle and the people . Ancient historians strongly criticize his choice to perform , calling it shameful . + Nero was persuaded to participate in the Olympic Games of 67 in order to improve relations with Greece and display Roman dominance . As a competitor , Nero raced a ten @-@ horse chariot and nearly died after being thrown from it . He also performed as an actor and a singer . Though Nero faltered in his racing ( in one case , dropping out entirely before the end ) and acting competitions , he won these crowns nevertheless and paraded them when he returned to Rome . The victories are attributed to Nero bribing the judges and his status as emperor . + + = = = War and peace with Parthia = = = + + Shortly after Nero 's accession to the throne in 54 , the Roman vassal kingdom of Armenia overthrew their Iberian prince Rhadamistus and he was replaced with the Parthian prince Tiridates . This was seen as a Parthian invasion of Roman territory . There was concern in Rome over how the young Emperor would handle the situation . Nero reacted by immediately sending the military to the region under the command of Gnaeus Domitius Corbulo . The Parthians temporarily relinquished control of Armenia to Rome . + The peace did not last and full @-@ scale war broke out in 58 . The Parthian king Vologases I refused to remove his brother Tiridates from Armenia .
The Parthians began a full @-@ scale invasion of the Armenian kingdom . Commander Corbulo responded and repelled most of the Parthian army that same year . Tiridates retreated and Rome again controlled most of Armenia . + Nero was acclaimed in public for this initial victory . Tigranes , a Cappadocian noble raised in Rome , was installed by Nero as the new ruler of Armenia . Corbulo was appointed governor of Syria as a reward . + In 62 , Tigranes invaded the Parthian province of Adiabene . Again , Rome and Parthia were at war and this continued until 63 . Parthia began building up for a strike against the Roman province of Syria . Corbulo tried to convince Nero to continue the war , but Nero opted for a peace deal instead . There was anxiety in Rome about eastern grain supplies and a budget deficit . + The result was a deal where Tiridates again became the Armenian king , but was crowned in Rome by Emperor Nero . In the future , the king of Armenia was to be a Parthian prince , but his appointment required approval from the Romans . Tiridates was forced to come to Rome and partake in ceremonies meant to display Roman dominance . + This peace deal of 63 was a considerable victory for Nero politically . Nero became very popular in the eastern provinces of Rome and with the Parthians as well . The peace between Parthia and Rome lasted 50 years until Emperor Trajan of Rome invaded Armenia in 114 . + + = = = Other major power struggles and rebellions = = = + + The war with Parthia was not Nero 's only major war but he was both criticized and praised for an aversion to battle . Like many emperors , Nero faced a number of rebellions and power struggles within the empire . + British Revolt of 60 – 61 ( Boudica 's Uprising ) + In 60 , a major rebellion broke out in the province of Britannia . 
While the governor Gaius Suetonius Paulinus and his troops were busy capturing the island of Mona ( Anglesey ) from the druids , the tribes of the southeast staged a revolt led by queen Boudica of the Iceni . Boudica and her troops destroyed three cities before the army of Paulinus could return , receive reinforcements , and quell the rebellion in 61 . Fearing Paulinus himself would provoke further rebellion , Nero replaced him with the more passive Publius Petronius Turpilianus . + The Pisonian Conspiracy of 65 + In 65 , Gaius Calpurnius Piso , a Roman statesman , organized a conspiracy against Nero with the help of Subrius Flavus and Sulpicius Asper , a tribune and a centurion of the Praetorian Guard . According to Tacitus , many conspirators wished to " rescue the state " from the emperor and restore the Republic . The freedman Milichus discovered the conspiracy and reported it to Nero 's secretary , Epaphroditos . As a result , the conspiracy failed and its members were executed including Lucan , the poet . Nero 's previous advisor , Seneca was ordered to commit suicide after admitting he discussed the plot with the conspirators . + The First Jewish War of 66 – 70 + In 66 , there was a Jewish revolt in Judea stemming from Greek and Jewish religious tension . In 67 , Nero dispatched Vespasian to restore order . This revolt was eventually put down in 70 , after Nero 's death . This revolt is famous for Romans breaching the walls of Jerusalem and destroying the Second Temple of Jerusalem . + + = = = The revolt of Vindex and Galba and the death of Nero = = = + + In March 68 , Gaius Julius Vindex , the governor of Gallia Lugdunensis , rebelled against Nero 's tax policies . Lucius Verginius Rufus , the governor of Germania Superior , was ordered to put down Vindex 's rebellion . 
In an attempt to gain support from outside his own province , Vindex called upon Servius Sulpicius Galba , the governor of Hispania Tarraconensis , to join the rebellion and further , to declare himself emperor in opposition to Nero . + At the Battle of Vesontio in May 68 , Verginius ' forces easily defeated those of Vindex and the latter committed suicide . However , after putting down this one rebel , Verginius ' legions attempted to proclaim their own commander as Emperor . Verginius refused to act against Nero , but the discontent of the legions of Germany and the continued opposition of Galba in Spain did not bode well for him . + While Nero had retained some control of the situation , support for Galba increased despite his being officially declared a public enemy . The prefect of the Praetorian Guard , Gaius Nymphidius Sabinus , also abandoned his allegiance to the Emperor and came out in support for Galba . + In response , Nero fled Rome with the intention of going to the port of Ostia and , from there , to take a fleet to one of the still @-@ loyal eastern provinces . According to Suetonius , Nero abandoned the idea when some army officers openly refused to obey his commands , responding with a line from Vergil 's Aeneid : " Is it so dreadful a thing then to die ? " Nero then toyed with the idea of fleeing to Parthia , throwing himself upon the mercy of Galba , or to appeal to the people and beg them to pardon him for his past offences " and if he could not soften their hearts , to entreat them at least to allow him the prefecture of Egypt " . Suetonius reports that the text of this speech was later found in Nero 's writing desk , but that he dared not give it from fear of being torn to pieces before he could reach the Forum . + Nero returned to Rome and spent the evening in the palace . After sleeping , he awoke at about midnight to find the palace guard had left . 
Dispatching messages to his friends ' palace chambers for them to come , he received no answers . Upon going to their chambers personally , he found them all abandoned . When he called for a gladiator or anyone else adept with a sword to kill him , no one appeared . He cried , " Have I neither friend nor foe ? " and ran out as if to throw himself into the Tiber . + Returning , Nero sought for some place where he could hide and collect his thoughts . An imperial freedman , Phaon , offered his villa , located 4 miles outside the city . Travelling in disguise , Nero and four loyal freedmen , Epaphroditos , Phaon , Neophytus , and Sporus , reached the villa , where Nero ordered them to dig a grave for him . + At this time , a courier arrived with a report that the Senate had declared Nero a public enemy and that it was their intention to execute him by beating him to death and that armed men had been sent to apprehend him for the act to take place in the Forum . The Senate actually was still reluctant and deliberating on the right course of action as Nero was the last member of the Julio @-@ Claudian Family . Indeed , most of the senators had served the imperial family all their lives and felt a sense of loyalty to the deified bloodline , if not to Nero himself . The men actually had the goal of returning Nero back to the Senate , where the Senate hoped to work out a compromise with the rebelling governors that would preserve Nero 's life , so that at least a future heir to the dynasty could be produced . + Nero , however , did not know this , and at the news brought by the courier , he prepared himself for suicide , pacing up and down muttering " Qualis artifex pereo " which translates to English as " What an artist dies in me . " Losing his nerve , he first begged for one of his companions to set an example by first killing himself . At last , the sound of approaching horsemen drove Nero to face the end . 
However , he still could not bring himself to take his own life but instead he forced his private secretary , Epaphroditos , to perform the task . + When one of the horsemen entered , upon his seeing Nero all but dead he attempted to stop the bleeding in vain . Nero 's final words were " Too late ! This is fidelity ! " He died on 9 June 68 , the anniversary of the death of Octavia , and was buried in the Mausoleum of the Domitii Ahenobarbi , in what is now the Villa Borghese ( Pincian Hill ) area of Rome . + With his death , the Julio @-@ Claudian dynasty ended . The Senate , when news of his death reached Rome , posthumously declared Nero a public enemy to appease the coming Galba ( as the Senate had initially declared Galba as a public enemy ) and proclaimed Galba the new emperor . Chaos would ensue in the year of the Four Emperors . + + = = = Post mortem = = = + + According to Suetonius and Cassius Dio , the people of Rome celebrated the death of Nero . Tacitus , though , describes a more complicated political environment . Tacitus mentions that Nero 's death was welcomed by Senators , nobility and the upper class . The lower @-@ class , slaves , frequenters of the arena and the theater , and " those who were supported by the famous excesses of Nero " , on the other hand , were upset with the news . Members of the military were said to have mixed feelings , as they had allegiance to Nero , but were bribed to overthrow him . + Eastern sources , namely Philostratus II and Apollonius of Tyana , mention that Nero 's death was mourned as he " restored the liberties of Hellas with a wisdom and moderation quite alien to his character " and that he " held our liberties in his hand and respected them . " + Modern scholarship generally holds that , while the Senate and more well @-@ off individuals welcomed Nero 's death , the general populace was " loyal to the end and beyond , for Otho and Vitellius both thought it worthwhile to appeal to their nostalgia . 
" + Nero 's name was erased from some monuments , in what Edward Champlin regards as an " outburst of private zeal " . Many portraits of Nero were reworked to represent other figures ; according to Eric R. Varner , over fifty such images survive . This reworking of images is often explained as part of the way in which the memory of disgraced emperors was condemned posthumously ( see damnatio memoriae ) . Champlin , however , doubts that the practice is necessarily negative and notes that some continued to create images of Nero long after his death . + The civil war during the year of the Four Emperors was described by ancient historians as a troubling period . According to Tacitus , this instability was rooted in the fact that emperors could no longer rely on the perceived legitimacy of the imperial bloodline , as Nero and those before him could . Galba began his short reign with the execution of many allies of Nero and possible future enemies . One such notable enemy included Nymphidius Sabinus , who claimed to be the son of Emperor Caligula . + Otho overthrew Galba . Otho was said to be liked by many soldiers because he had been a friend of Nero 's and resembled him somewhat in temperament . It was said that the common Roman hailed Otho as Nero himself . Otho used " Nero " as a surname and reerected many statues to Nero . Vitellius overthrew Otho . Vitellius began his reign with a large funeral for Nero complete with songs written by Nero . + After Nero 's suicide in 68 , there was a widespread belief , especially in the eastern provinces , that he was not dead and somehow would return . This belief came to be known as the Nero Redivivus Legend . + The legend of Nero 's return lasted for hundreds of years after Nero 's death . Augustine of Hippo wrote of the legend as a popular belief in 422 . + At least three Nero imposters emerged leading rebellions . 
The first , who sang and played the cithara or lyre and whose face was similar to that of the dead emperor , appeared in 69 during the reign of Vitellius . After persuading some to recognize him , he was captured and executed . Sometime during the reign of Titus ( 79 – 81 ) , another impostor appeared in Asia and sang to the accompaniment of the lyre and looked like Nero but he , too , was killed . Twenty years after Nero 's death , during the reign of Domitian , there was a third pretender . He was supported by the Parthians , who only reluctantly gave him up , and the matter almost came to war . + + = = Physical appearance = = + + In his book The Lives of the Twelve Caesars , Suetonius describes Nero as " about the average height , his body marked with spots and malodorous , his hair light blonde , his features regular rather than attractive , his eyes blue and somewhat weak , his neck over thick , his belly prominent , and his legs very slender . " + + = = Historiography = = + + The history of Nero 's reign is problematic in that no historical sources survived that were contemporary with Nero . These first histories at one time did exist and were described as biased and fantastical , either overly critical or praising of Nero . The original sources were also said to contradict on a number of events . Nonetheless , these lost primary sources were the basis of surviving secondary and tertiary histories on Nero written by the next generations of historians . A few of the contemporary historians are known by name . Fabius Rusticus , Cluvius Rufus and Pliny the Elder all wrote condemning histories on Nero that are now lost . There were also pro @-@ Nero histories , but it is unknown who wrote them or for what deeds Nero was praised . + The bulk of what is known of Nero comes from Tacitus , Suetonius and Cassius Dio , who were all of the senatorial class . 
Tacitus and Suetonius wrote their histories on Nero over fifty years after his death , while Cassius Dio wrote his history over 150 years after Nero 's death . These sources contradict on a number of events in Nero 's life including the death of Claudius , the death of Agrippina , and the Roman fire of 64 , but they are consistent in their condemnation of Nero . + A handful of other sources also add a limited and varying perspective on Nero . Few surviving sources paint Nero in a favourable light . Some sources , though , portray him as a competent emperor who was popular with the Roman people , especially in the east . + Cassius Dio + Cassius Dio ( c . 155 – 229 ) was the son of Cassius Apronianus , a Roman senator . He passed the greater part of his life in public service . He was a senator under Commodus and governor of Smyrna after the death of Septimius Severus ; and afterwards suffect consul around 205 , and also proconsul in Africa and Pannonia . + Books 61 – 63 of Dio 's Roman History describe the reign of Nero . Only fragments of these books remain and what does remain was abridged and altered by John Xiphilinus , an 11th @-@ century monk . + Dio Chrysostom + Dio Chrysostom ( c . 40 – 120 ) , a Greek philosopher and historian , wrote the Roman people were very happy with Nero and would have allowed him to rule indefinitely . They longed for his rule once he was gone and embraced imposters when they appeared : + Indeed the truth about this has not come out even yet ; for so far as the rest of his subjects were concerned , there was nothing to prevent his continuing to be Emperor for all time , seeing that even now everybody wishes he were still alive . And the great majority do believe that he still is , although in a certain sense he has died not once but often along with those who had been firmly convinced that he was still alive . + Epictetus + Epictetus ( c . 55 – 135 ) was the slave to Nero 's scribe Epaphroditos . 
He makes a few passing negative comments on Nero 's character in his work , but makes no remarks on the nature of his rule . He describes Nero as a spoiled , angry and unhappy man . + Josephus + The historian Josephus ( c . 37 – 100 ) , while calling Nero a tyrant , was also the first to mention bias against Nero . Of other historians , he said : + But I omit any further discourse about these affairs ; for there have been a great many who have composed the history of Nero ; some of which have departed from the truth of facts out of favour , as having received benefits from him ; while others , out of hatred to him , and the great ill @-@ will which they bore him , have so impudently raved against him with their lies , that they justly deserve to be condemned . Nor do I wonder at such as have told lies of Nero , since they have not in their writings preserved the truth of history as to those facts that were earlier than his time , even when the actors could have no way incurred their hatred , since those writers lived a long time after them . + Lucan + Though more of a poet than historian , Lucanus ( c . 39 – 65 ) has one of the kindest accounts of Nero 's rule . He writes of peace and prosperity under Nero in contrast to previous war and strife . Ironically , he was later involved in a conspiracy to overthrow Nero and was executed . + Philostratus + Philostratus II " the Athenian " ( c . 172 – 250 ) spoke of Nero in the Life of Apollonius Tyana ( Books 4 – 5 ) . Though he has a generally bad or dim view of Nero , he speaks of others ' positive reception of Nero in the East . + Pliny the Elder + The history of Nero by Pliny the Elder ( c . 24 – 79 ) did not survive . Still , there are several references to Nero in Pliny 's Natural Histories . Pliny has one of the worst opinions of Nero and calls him an " enemy of mankind . " + Plutarch + Plutarch ( c . 46 – 127 ) mentions Nero indirectly in his account of the Life of Galba and the Life of Otho . 
Nero is portrayed as a tyrant , but those that replace him are not described as better . + Seneca the Younger + It is not surprising that Seneca ( c . 4 BC – 65 ) , Nero 's teacher and advisor , writes very well of Nero . + Suetonius + Suetonius ( c . 69 – 130 ) was a member of the equestrian order , and he was the head of the department of the imperial correspondence . While in this position , Suetonius started writing biographies of the emperors , accentuating the anecdotal and sensational aspects . + Tacitus + The Annals by Tacitus ( c . 56 – 117 ) is the most detailed and comprehensive history on the rule of Nero , despite being incomplete after the year 66 . Tacitus described the rule of the Julio @-@ Claudian emperors as generally unjust . He also thought that existing writing on them was unbalanced : + The histories of Tiberius , Caius , Claudius and Nero , while they were in power , were falsified through terror , and after their death were written under the irritation of a recent hatred . + Tacitus was the son of a procurator , who married into the elite family of Agricola . He entered his political life as a senator after Nero 's death and , by Tacitus ' own admission , owed much to Nero 's rivals . Realising that this bias may be apparent to others , Tacitus protests that his writing is true . + Girolamo Cardano + In 1562 Girolamo Cardano published in Basel his Encomium Neronis , which was one of the first historical references of the Modern era to portray Nero in a positive light . + + = = Nero in Jewish and Christian tradition = = + + + = = = Jewish tradition = = = + + At the end of 66 , conflict broke out between Greeks and Jews in Jerusalem and Caesarea . According to the Talmud , Nero went to Jerusalem and shot arrows in all four directions . All the arrows landed in the city . He then asked a passing child to repeat the verse he had learned that day . The child responded , " I will lay my vengeance upon Edom by the hand of my people Israel " ( Ez . 
25 @,@ 14 ) . Nero became terrified , believing that God wanted the Temple in Jerusalem to be destroyed , but would punish the one to carry it out . Nero said , " He desires to lay waste His House and to lay the blame on me , " whereupon he fled and converted to Judaism to avoid such retribution . Vespasian was then dispatched to put down the rebellion . + The Talmud adds that the Jewish sage Reb Meir Baal HaNess ( also known as Rabbi Meir or Rabbi Meir Baal HaNes , " Rabbi Meir the miracle maker " ) , who lived in the time of the Mishna , was a prominent supporter of the Bar Kokhba rebellion against Roman rule . He was considered one of the greatest of the Tannaim of the third generation ( 139 @-@ 163 ) . According to the Talmud , his father was a descendant of the Roman Emperor Nero who had converted to Judaism . His wife Bruriah is one of the few women cited in the Gemara . He is the third most frequently mentioned sage in the Mishnah . + Roman and Greek sources nowhere report Nero 's alleged trip to Jerusalem or his alleged conversion to Judaism . There is also no record of Nero having any offspring who survived infancy : his only recorded child , Claudia Augusta , died aged 4 months . + + = = = Christian tradition = = = + + Non @-@ Christian historian Tacitus describes Nero extensively torturing and executing Christians after the fire of 64 . Suetonius also mentions Nero punishing Christians , though he does so because they are " given to a new and mischievous superstition " and does not connect it with the fire . + Christian writer Tertullian ( c . 155 – 230 ) was the first to call Nero the first persecutor of Christians . He wrote , " Examine your records . There you will find that Nero was the first that persecuted this doctrine " . Lactantius ( c . 240 – 320 ) also said that Nero " first persecuted the servants of God " , as does Sulpicius Severus .
However , Suetonius writes that , " since the Jews constantly made disturbances at the instigation of Chrestus , he [ emperor Claudius ] expelled them from Rome " ( " Iudaeos impulsore Chresto assidue tumultuantis Roma expulit " ) . These expelled " Jews " may have been early Christians , although Suetonius is not explicit . Nor is the Bible explicit , calling Aquila of Pontus and his wife , Priscilla , both expelled from Italy at the time , " Jews " . + + = = = = Martyrdoms of Peter and Paul = = = = + + The first text to suggest that Nero ordered the execution of an apostle is a letter by Clement to the Corinthians traditionally dated to around 96 A.D. The apocryphal Ascension of Isaiah , a Christian writing from the 2nd century , says , " the slayer of his mother , who himself ( even ) this king , will persecute the plant which the Twelve Apostles of the Beloved have planted . Of the Twelve one will be delivered into his hands " ; this was interpreted to mean Nero . + Bishop Eusebius of Caesarea ( c . 275 – 339 ) was the first to write explicitly that Paul was beheaded in Rome during the reign of Nero . He states that Nero 's persecution led to Peter and Paul 's deaths , but that Nero did not give any specific orders . However , several other accounts going back to the 1st century have Paul surviving his two years in Rome and travelling to Hispania , before facing trial in Rome again prior to his death . + Peter is first said to have been crucified upside @-@ down in Rome during Nero 's reign ( but not by Nero ) in the apocryphal Acts of Peter ( c . 200 ) . The account ends with Paul still alive and Nero abiding by God 's command not to persecute any more Christians . + By the 4th century , a number of writers were stating that Nero killed Peter and Paul . + + = = = = The Antichrist = = = = + + The Sibylline Oracles , Books 5 and 8 , written in the 2nd century , speak of Nero returning and bringing destruction .
Within Christian communities , these writings , along with others , fueled the belief that Nero would return as the Antichrist . In 310 , Lactantius wrote that Nero " suddenly disappeared , and even the burial place of that noxious wild beast was nowhere to be seen . This has led some persons of extravagant imagination to suppose that , having been conveyed to a distant region , he is still reserved alive ; and to him they apply the Sibylline verses " . Lactantius maintains that it is not right to believe this . + In 422 , Augustine of Hippo wrote about 2 Thessalonians 2 : 1 – 11 , where he believed Paul mentioned the coming of the Antichrist . Though he rejects the theory , Augustine mentions that many Christians believed that Nero was the Antichrist or would return as the Antichrist . He wrote , " so that in saying , ' For the mystery of iniquity doth already work , ' he alluded to Nero , whose deeds already seemed to be as the deeds of Antichrist . " + Some modern biblical scholars , such as Delbert Hillers ( Johns Hopkins University ) of the American Schools of Oriental Research and the editors of the Oxford & Harper Collins Study Bibles , contend that the number 666 in the Book of Revelation is a code for Nero , a view that is also supported in Roman Catholic Biblical commentaries . + The concept of Nero as the Antichrist is often a central belief of Preterist eschatology . + + + = Manila = + + Manila ( / məˈnɪl.ə / ) is the capital city of the Philippines , founded on June 24 , 1571 by Spanish conquistador Miguel López de Legazpi . It is one of the oldest cities in the country and was the seat of power for most of the colonial rules of the Philippines . It is situated on the eastern shore of Manila Bay and contains a multitude of landmarks , some of which date back to the 16th century , such as the Spanish colonial era Walled City of Intramuros .
+ Manila is the second most populous city in the Philippines after the former capital Quezon City with a population of 1 @,@ 780 @,@ 148 in 2015 . Because of its small land area and huge population , Manila is regarded as one of the most densely populated cities in the world with 42 @,@ 857 people per square kilometer . Manila is one of the sixteen cities and a municipality that make up Metro Manila , the National Capital Region of the Philippines . In 2012 , Globalization and World Cities Research Network listed Manila as a global city . + Manila has six representative districts for the lower house of the Philippine Congress . Furthermore , the city is composed of 16 districts , namely : Binondo , Ermita , Intramuros , Malate , Paco , Pandacan , Port Area , Quiapo , Sampaloc , San Andres , San Miguel , San Nicolas , Santa Ana , Santa Cruz , Santa Mesa and Tondo . + The Kingdom of Tondo once ruled in the vicinity of Manila before it briefly became a province of the Hindu Majapahit Empire . During the Brunei invasion of the Philippines , Sultan Bolkiah of Brunei captured Seludong ( a village in modern @-@ day Manila ) and renamed it Maynilà , a Tagalog term referring to the presence of the Nila shrub . Maynila was a vassal state of Brunei , established to overpower Tondo . Maynilà had been Indianized since the sixth century CE and earlier . It had become partly Islamic and Hindu @-@ animist by the 15th century CE . + In 1571 Spanish Conquistadors arrived from Mexico , from across the Pacific , and founded present @-@ day Manila in what today is Intramuros . Spanish missionaries soon Christianized the city and incorporated Tondo under Manila and then built some of the oldest churches in the country , including San Agustin Church . The Conquistadors renamed the area Nuevo Reino de Castilla ( New Kingdom of Castille ) and shortened the name to Manila . 
+ Manila became the center of Spanish activity in the Far East and one end of the Manila @-@ Acapulco Galleon trade route , linking Spanish America with Asia , one of the earliest examples of globalization . Due to the central location in the Pacific sea trade routes , Manila received the moniker of the " Pearl of the Orient " . Spanish rule of Manila and the entire Philippine archipelago lasted for over three centuries , until 1898 . At different times during the long Spanish period there were local revolts , Chinese insurrections , massive pirate attacks , great earthquakes , Dutch raids and invasion attempts , and a British occupation of the city during their unsuccessful attempt to conquer the Philippines . Order was usually quickly restored and the city returned to the business of trade . In the 19th century Manila was one of the most modern cities in Asia . Before the Spanish – American War , Manila saw the rise of the Philippine Revolution . Under the American rule following the Spanish – American War , the United States changed the official language from Spanish to English and made some changes in education , local laws and urban planning . Towards the end of World War II , during the Battle of Manila most of the city was flattened by intensive aerial bombardment by the United States Air Force . As a result , relatively little remains of Manila 's prewar and colonial architecture , although there are ongoing restoration projects , especially within the old walled city , Intramuros . + + = = History = = + + The earliest evidence of human life in and around the area of Manila is the nearby Angono Petroglyphs dated to around 3000 BC . Furthermore , negritos , a class of Australoid peoples , became the aboriginal inhabitants of the Philippines . They were found across Luzon before the Malayo @-@ Polynesians migrated in and assimilated them . 
+ The Kingdom of Tondo flourished during the latter half of the Ming Dynasty as a result of direct trade relations with China . Tondo district was maintained as the traditional capital of the empire , with its rulers as sovereign kings and not mere chieftains , and were addressed variously as panginuan ln Meranau or panginoón in Tagalog ( " lords " ) ; anák banwa ( " son of heaven " ) ; or lakandula ( " lord of the palace " ) , the Emperor of China considered the Lakans ( rulers of ancient Manila ) " 王 " ( Kings ) . + In the 13th century , Manila consisted of a fortified settlement and trading quarter at the shores of the Pasig River , on top of previous older towns . Manila was then settled by the Indianized empire of Majapahit as referenced in the epic eulogy poem Nagarakretagama which inscribed its conquest by Maharaja Hayam Wuruk . Selurong " षेलुरोन ् ग ् " which is a historical name for the city of Manila is listed in Canto 14 alongside Sulot , which is now Sulu , and Kalka . + During the reign of Sultan Bolkiah from 1485 to 1521 , the Bruneian Empire invaded , wanting to take advantage of Tondo 's China trade by attacking its environs and establishing " كوتا سلودوڠ Kota Saludong " ( The Kingdom of Maynila ) . They ruled under and gave yearly tribute to the Sultanate of Brunei as its satellite state . They established a new dynasty under the local leader who accepted Islam and became Rajah Salalila or Tariq Sulayman I. He also established a trading challenge to the already rich House of Lakan Dula in Tondo . Islam was further strengthened by the arrival of Muslim traders from the Arab @-@ Indian area and Southeast Asia . Manila was temporarily besieged by the invasion of Chinese pirate @-@ warlord Limahong ( 1574 ) but was thwarted by the local inhabitants , before it became the seat of the colonial government of Spain . 
+ On June 24 , 1571 , Spanish conquistador Miguel López de Legazpi arrived from New Spain ( now Mexico ) , and then exercised rule of the Spanish city of Manila as a territory of New Spain with the establishment of a city council in what today is the district of Intramuros . López de Legazpi had the local royalty executed or exiled , after the failure of the Tondo Conspiracy ; a plot wherein an alliance between Japanese merchants , Luzon 's Huangs with several Datus and Rajahs plus the Bruneian Empire would band together to execute the Spaniards and their Latin @-@ American mercenaries , and Visayan allies . At the conclusion of which , the victorious Spaniards made Manila the capital of the Spanish East Indies and of the Philippines , which the empire would control for the next three centuries , from 1565 to 1898 . + Manila then became famous during the Manila @-@ Acapulco Galleon trade which lasted for three centuries and brought goods from Europe , Africa and Latin America across the Pacific Islands to Southeast Asia ( Which was already an entrepot for goods coming from India , Indonesia and China ) and trade also flowed vice versa . Silver that was mined in Mexico and Peru were exchanged for Chinese silk , Indian gems , and the spices of the Southeast Asia , some of which even flowed to Europe . Likewise wines and olives grown from Europe and North Africa were transshipped via Mexico towards Manila . + Manila was occupied by British forces for twenty months , from 1762 to 1764 , and used as a base for an unsuccessful attempt to conquer the Philippines during the Seven Years ' War . Eventually , the British withdrew from Manila as per agreements in the 1763 Treaty of Paris . The Chinese were punished for supporting the British invasion , and the small fortress @-@ city of Intramuros , mostly populated by Europeans and Mexicans , kept its cannons pointed at Binondo , the world 's oldest Chinatown . 
+ Mexican Independence in 1821 necessitated direct rule from Spain . Under direct Spanish rule , banking , industry and education flourished more than it had in the previous two centuries . The opening of the Suez Canal in 1869 facilitated direct trade and communications with Spain . + The growing wealth and education attracted indigenous , Chinese , Indians , Latinos , and Europeans from the provinces to Manila , all of whom elected a nascent Filipino citizenship regardless of ethnicity . The developments also facilitated the rise of an illustrado class which espoused liberal ideas , the ideological foundations of the Philippine Revolution which sought independence from Spain . + After the Battle of Manila ( 1898 ) , Spain ceded the surrendered city of Manila to the United States . The First Philippine Republic based at nearby Bulacan fought against the Americans for control of the city of Manila . The Americans defeated the First Philippine Republic and captured president Emilio Aguinaldo who announced allegiance to the United States on April 1 , 1901 . + Upon drafting a new charter for Manila in June 1901 , the Americans made official what had long been tacit : that the City of Manila was not Intramuros alone but also all its arrabales . The new city charter proclaimed that Manila was composed of eleven municipal districts — presumably Tondo , Binondo , Santa Cruz , Sampaloc , San Miguel , Pandacan , Santa Ana , Paco , Malate , Ermita and Intramuros . In addition to these , the Church recognized five parishes as Manileno — namely , Gagalangin , Trozo , Balic @-@ Balic , Santa Mesa and Singalong . Later times would add two more : Balut and San Andres Bukid . + Under American control , a new civilian oriented Insular Government headed by then Governor @-@ General William Howard Taft invited city planner Daniel Burnham for the transformation of Manila , to adapt the old city to changed times and modern needs . 
The Burnham Plan included development of the road system , the use of waterways for transportation , and beautification of Manila with the improvement of waterfronts , construction of parks , parkways and various buildings for various activities . + The latter included a government center occupying all of Wallace Field , which extends from Luneta to the present Taft Avenue . The Philippine Capitol was to rise at the Taft Avenue end of the field , facing toward the sea , and would form , with the buildings of different government bureaus and departments , a quadrangle , lagoon in the center , and a monument to José Rizal at its Luneta end . Of Burnham 's proposed government center , only three units — the Legislative Building and the buildings of the Finance and Agricultural departments — were completed when World War II erupted . + Due to the Japanese occupation of the Philippines , American soldiers were ordered to withdraw from the city and all military installations were removed on December 24 , 1941 . General Douglas MacArthur declared Manila an open city to prevent further death and destruction ; despite this , the Japanese warplanes continued to bomb the city . Manila was occupied by the Japanese forces on January 2 , 1942 . + Manila was also the site of the bloodiest battle in the Pacific theater during the Second World War . After falling to the Empire of Japan on January 2 , 1942 , it was recaptured by joint American and Filipino troops from February 3 to March 3 , 1945 . Some 100 @,@ 000 civilians were killed in Manila in February 1945 . It was the second most devastated city in the world after Warsaw during the Second World War . At the end of World War II , almost all of the structures in the city , particularly Intramuros , were destroyed but after the war , reconstruction took place . 
+ In 1948 , President Elpidio Quirino moved the seat of government of the Philippines to Quezon City , a new capital city in the suburbs and fields northeast of Manila , created in 1938 by former President Manuel L. Quezon , which was named after him . The move ended any implementation of the Burnham Plan 's intent for the government centre to be at Luneta . + With the Visayan @-@ born Arsenio Lacson as its first elected mayor in 1952 ( all mayors were appointed prior to this ) , Manila underwent The Golden Age , once again earning its status as the " Pearl of the Orient " , a moniker it earned before the Second World War . After Lacson 's term in the 1950s , Manila was led by Antonio Villegas for most of the 1960s . Ramon Bagatsing ( an Indian @-@ Filipino ) was mayor for nearly the entire 1970s until the 1986 People Power Revolution . Mayors Lacson , Villegas , and Bagatsing are often collectively considered as the " Big Three of Manila " less for their rather long tenures as the city 's chief executive ( continuously for over three decades , from 1952 – 1986 ) , but more for their indelible contribution to the development and progress of the city and their lasting legacy in uplifting the quality of life and welfare of the people of Manila . + During the administration of President Ferdinand Marcos , the region of the Metro Manila was created as an integrated unit with the enactment of Presidential Decree No. 824 on November 7 , 1975 . The area encompassed four cities and thirteen adjoining towns , as a separate regional unit of government . On the 405th anniversary of the city 's foundation on June 24 , 1976 , Manila was reinstated by Marcos as the capital of the Philippines for its historical significance as the seat of government since the Spanish Period . Presidential Decree No. 
940 states that Manila has always been to the Filipino people and in the eyes of the world , the premier city of the Philippines being the center of trade , commerce , education and culture . + During the martial law era , Manila became a hot @-@ bed of resistance activity as youth and student demonstrators repeatedly clashed with the police and military which were subservient to the Marcos regime . After decades of resistance , the non @-@ violent People Power Revolution ( predecessor to the peaceful @-@ revolutions that toppled the iron @-@ curtain in Europe ) , ousted the authoritarian Marcos from power . + In 1992 , Alfredo Lim was elected mayor , the first Chinese @-@ Filipino to hold the office . He was known for his anti @-@ crime crusades . Lim was succeeded by Lito Atienza , who served as his vice @-@ mayor . Atienza was known for his campaign ( and city slogan ) " Buhayin ang Maynila " ( Revive Manila ) , which saw the establishment of several parks and the repair and rehabilitation of the city 's deteriorating facilities . He was the city 's mayor for 3 terms ( 9 years ) before being termed out of office . + Alfredo Lim once again ran for mayor and defeated Atienza 's son Ali in the 2007 city election and immediately reversed all of Atienza 's projects claiming Atienza 's projects made little contribution to the improvements of the city . The relationship of both parties turned bitter , with the two facing off again during the 2010 city elections in which Lim won against Atienza . + Lim was sued by councilor Dennis Alcoreza in 2008 over human rights , charged with graft over the rehabilitation of public schools , and was heavily criticized for his haphazard resolution of the Rizal Park hostage taking incident , one of the deadliest hostage crises in the Philippines . Later on , Vice Mayor Isko Moreno and 28 city councilors filed another case against Lim in 2012 , stating that Lim 's statements in a meeting were " life @-@ threatening " to them . 
In the 2013 elections , former President Joseph Estrada defeated Lim in the mayoral race . During his term , Estrada paid off the city 's debts of over ₱ 5 billion , increased revenues by 2 @.@ 35 times from ₱ 6 @.@ 2 billion in 2012 to ₱ 14 @.@ 6 billion by 2016 , spent from 2013 to 2016 an unprecedented ₱ 6 @.@ 76 billion for the city 's infrastructure , built or renovated seven city public markets , built 22 schools , increased teachers ' incomes , modernized the six city hospitals and bought dialysis machines and magnetic resonance imaging scanners , increased the efficiency of the police force and reduced crime . Manila had become the most competitive city in the Philippines by 2015 , making the city the best place for doing business and for living in . + Despite his achievements and unprecedented feat as the Mayor of Manila , Estrada was re @-@ elected as Manila mayor in the 2016 election against Lim and Amado Bagatsing only narrowly , winning by a mere 2 @,@ 830 votes . + + = = Geography = = + + Manila is located on the eastern shores of Manila Bay , which rests on the western shores of Luzon . Manila lies 800 miles ( 1 @,@ 300 kilometers ) from mainland Asia . The Pasig River bisects Manila . + Almost all of Manila sits on top of centuries of prehistoric alluvial deposits built by the waters of the Pasig and on some land reclaimed from Manila Bay . Manila 's land has been altered substantially by human intervention , with considerable land reclamation along the waterfronts since the American colonial times . Some of the natural variations in topography have been evened out due to the urbanization of the city . As of 2013 , Manila has a total area of 42 @.@ 88 square kilometres ( 16 @.@ 56 sq mi ) . + + = = = Earthquakes = = = + + Manila sits astride the Pacific typhoon belt and is criss @-@ crossed by several fault lines . 
This led to Manila and its metropolitan region being ranked as the second riskiest capital ( city ) to live in according to Swiss Re . The seismically active Marikina Valley Fault System poses a threat to Manila and the surrounding regions . + Manila has endured several deadly earthquakes , notably in 1645 and in 1677 , which destroyed the stone and brick medieval city . The Earthquake Baroque style was used by the Colonial architects during the Spanish colonial period in order to adapt to the frequent earthquakes . + + = = = Climate = = = + + Under the Köppen climate classification system , Manila features a tropical savanna climate ( Köppen climate classification Aw ) . Together with the rest of the Philippines , Manila lies entirely within the tropics . Its proximity to the equator means that the temperature range is very small , rarely going below 20 ° C ( 68 ° F ) or above 38 ° C ( 100 ° F ) . Temperature extremes have ranged from 14 @.@ 5 ° C ( 58 @.@ 1 ° F ) on January 11 , 1914 to 38 @.@ 6 ° C ( 101 @.@ 5 ° F ) on May 7 , 1915 . + Humidity levels are usually very high all year round . Manila has a distinct dry season from December through May , and a relatively lengthy wet season that covers the remaining period with slightly cooler temperatures . In the rainy season it rarely rains all day but the rainfall is very heavy during short periods . Typhoons usually occur from June to September . + + = = = Environment = = = + + Due to industrial waste and automobiles , Manila suffers from air pollution , affecting 98 % of the population . Annually , the air pollution causes more than 4 @,@ 000 deaths . Ermita is Manila 's most air polluted district due to open dump sites and industrial waste . According to a report in 2003 , the Pasig River is one of the most polluted rivers in the world with 150 tons of domestic waste and 75 tons of industrial waste dumped daily . + Annually , Manila is hit with 6 to 7 typhoons creating floods . 
In 2009 , Typhoon Ketsana struck the Philippines . In the aftermath of Typhoon Ketsana , the lack of infrastructure led to one of the worst floodings in the Philippines and created a significant amount of pollution . Following the aftermath of Typhoon Ketsana , the city began to dredge its rivers and improve its drainage network . The Pasig River Rehabilitation Commission is in charge of cleaning up the Pasig River and tributaries for transportation , recreation and tourism purposes . Rehabilitation efforts have resulted in the creation of parks along the riverside , along with stricter pollution controls . + + = = Cityscape = = + + + = = = Architecture = = = + + Manila has architecturally significant buildings in a wide range of styles spanning distinct historical and cultural periods . Architectural styles reflect American , Spanish , Chinese , and Malay influences . Prominent Filipino architects such as Antonio Toledo , Felipe Roxas , Juan M. Arellano and Tomás Mapúa have designed significant buildings in Manila such as churches , government offices , theaters , mansions , schools and universities . + Manila is known for its distinct Art Deco theaters which are designed by National Artists such as Juan Nakpil and Pablo Antonio . The historic Escolta Street in Binondo features many buildings of neo @-@ classical and beaux @-@ arts architectural style , many of which were designed by prominent Filipino architects during the American Rule in the 1920s to the late 1930s . Many architects , artists , historians and heritage advocacy groups are pushing for the revival of Escolta Street , which was once the premier street of the Philippines . + Unfortunately , much of Manila 's prewar and Spanish colonial architecture was destroyed during World War II . Reconstruction took place afterwards , replacing the destroyed historic Spanish @-@ era buildings with modern ones , erasing much of the city 's character . 
Some buildings destroyed by the war have been reconstructed , such as the Old Legislative Building ( National Museum ) , Ayuntamiento de Manila ( Bureau of the Treasury ) and the currently under construction San Ignacio Church ( Museo de Intramuros ) . Plans have been laid out to rehabilitate several neglected historic buildings and places such as Plaza Del Carmen , San Sebastian Church and the Manila Metropolitan Theater and soon Spanish @-@ era shops and houses in Quiapo , Binondo , and San Nicolas will be restored to their former splendor , as a part of a movement to restore Manila to its former glory . + Since Manila is prone to earthquakes , the Spanish colonial architects invented the style called Earthquake Baroque which the churches and government buildings during the Spanish colonial period adopted . As a result , succeeding earthquakes of the 18th and 19th centuries barely affected Manila , although they did periodically level the surrounding area . Modern buildings in and around Manila are designed or have been retrofitted to withstand an 8 @.@ 2 magnitude quake in accordance with the country 's building code . + + = = = Barangays and districts = = = + + Manila is composed of fourteen districts according to Republic Act No. 409 , otherwise known as the Revised Charter of the City of Manila . Two were later added , which are Santa Mesa ( partitioned off from Sampaloc ) and San Andres ( partitioned off from Santa Ana ) . + The city has 896 barangays that are known by sequential numbers instead of names . These barangays are further grouped into 100 zones for administrative and municipal purposes . + + = = = Military and national security = = = + + The headquarters of the Philippine Coast Guard is located at the South Harbor in Port Area near Intramuros and Ermita . The Philippine Navy on the other hand has its headquarters in Naval Station Jose Andrada located along Roxas Boulevard in Malate . 
Furthermore , the AFP Joint Task Force @-@ National Capital Region was created in 2012 to ensure peace and stability in Metro Manila , of which Manila is a part . It bears the same functions of the deactivated National Capital Regional Command , although it operates on a much smaller size than its predecessor . + + = = = Slums = = = + + There are an estimated 4 million slum dwellers living in Manila as of 2014 . + + = = Demographics = = + + According to the 2015 census , the population of the city was 1 @,@ 780 @,@ 148 , making it the second most populous city in the Philippines . + Manila is the most densely populated city in the world with 43 @,@ 079 inhabitants per km2 . District 6 is listed as being the most dense with 68 @,@ 266 inhabitants per km2 , followed by District 1 with 64 @,@ 936 and District 2 with 64 @,@ 710 , respectively . District 5 is the least densely populated area with 19 @,@ 235 . + Manila 's population density dwarfs that of Kolkata ( 27 @,@ 774 inhabitants per km2 ) , Mumbai ( 22 @,@ 937 inhabitants per km2 ) , Paris ( 20 @,@ 164 inhabitants per km2 ) , Dhaka ( 19 @,@ 447 inhabitants per km2 ) , Shanghai ( 16 @,@ 364 inhabitants per km2 , with its most dense district , Nanshi , having a density of 56 @,@ 785 inhabitants per km2 ) , and Tokyo ( 10 @,@ 087 inhabitants per km2 ) . + The vernacular language is Filipino , based mostly on the Tagalog of surrounding areas , and this Manila form of speaking Tagalog has essentially become the lingua franca of the Philippines , having spread throughout the archipelago through mass media and entertainment . Meanwhile , English is the language most widely used in education , business , and heavily in everyday usage throughout the Metro Manila region and the Philippines itself . 
+ A number of older residents can still speak basic Spanish , which used to be a mandatory subject in the curriculum of Philippine universities and colleges , and many children of Japanese Filipino , Indian Filipino , and other migrants or expatriates also speak their parents ' languages at home , aside from English and / or Filipino for everyday use . Minnan Chinese ( known as Lannang @-@ oe ) is spoken by the city 's Chinese @-@ Filipino community . + + = = Economy = = + + The city is a major center for commerce , banking and finance , retailing , transportation , tourism , real estate , new media as well as traditional media , advertising , legal services , accountancy , insurance , theater , fashion , and the arts in the Philippines . + The Cities and Municipalities Competitiveness Index , published by the National Competitiveness Council of the Philippines , ranks the cities , municipalities and provinces of the country according to their economic dynamism , government efficiency and infrastructure . Manila placed third in the Highly Urbanized City ( HUC ) category . Previously , Manila was the country 's most competitive city in 2015 , making it the best place to live in and do business . + The Port of Manila is the largest seaport in the Philippines , making it the premier international shipping gateway to the country . The Philippine Ports Authority is government agency responsible to oversee the operation and management of the ports . The International Container Terminal Services Inc. cited by the Asian Development Bank as one of the top five major maritime terminal operators in the world has its headquarters and main operations on the ports of Manila . Another port operator , the Asian Terminal Incorporated , has its corporate office and main operations in the Manila South Harbor and its container depository located in Santa Mesa . 
 + Binondo , the oldest and one of the largest Chinatowns in the world , was the center of commerce and business activities in the city . Numerous residential and office skyscrapers are found within its medieval streets . Plans to make the Chinatown area into a business process outsourcing ( BPO ) hub are progressing and are being aggressively pursued by the city government of Manila . 30 buildings have already been identified to be converted into BPO offices . These buildings are mostly located along the Escolta Street of Binondo , which are all unoccupied and can be converted into offices . + Divisoria in Tondo is dubbed as the " shopping mecca of the Philippines " . Numerous shopping malls are located in this place , which sell products and goods at bargain price . Small vendors occupy several roads , which causes pedestrian and vehicular traffic . A famous landmark in Divisoria is the Tutuban Center , a large shopping mall that is a part of the Philippine National Railways ' Main Station . It attracts 1 million people every month , but is expected to add another 400 @,@ 000 people when the LRT @-@ 2 West Extension is constructed , making it Manila 's busiest transfer station . + Diverse manufacturers within the city produce industrial @-@ related products such as chemicals , textiles , clothing , and electronic goods . Food and beverages and tobacco products are also produced . Local entrepreneurs continue to process primary commodities for export , including rope , plywood , refined sugar , copra , and coconut oil . The food @-@ processing industry is one of the most stable major manufacturing sectors in the city . + The Pandacan Oil Depot houses the storage facilities and distribution terminals of the three major players in the country 's petroleum industry , namely Caltex Philippines , Pilipinas Shell and Petron Corporation . The oil depot has been a subject of various concerns , including its environmental and health impact on the residents of Manila . 
The Supreme Court has ordered that the oil depot be relocated outside the city by July 2015 , but it failed to meet this deadline . It is currently being demolished , with demolition expected to be finished before the year 2016 ends , and plans have been set up to turn this 33 hectare facility into a transport hub or even a food park . + Manila is a major publishing center in the Philippines . Manila Bulletin , the Philippines ' largest broadsheet newspaper by circulation , is headquartered inside Intramuros . Other major publishing companies in the country like The Manila Times , The Philippine Star and Manila Standard Today are headquartered inside the Port Area . The Chinese Commercial News , the Philippines ' oldest existing Chinese @-@ language newspaper , and the country 's third @-@ oldest existing newspaper , is headquartered in Binondo . + Manila serves as the headquarters of the Central Bank of the Philippines which is located along Roxas Boulevard . Some universal banks in the Philippines that have their headquarters in the city are the Landbank of the Philippines and Philippine Trust Company . Philam Life Insurance Company , currently the largest life insurance company in the Philippines in terms of assets , net worth , investment and paid @-@ up capital , has its headquarters along United Nations Avenue in Ermita . Unilever Philippines has its corporate office along United Nations Avenue in Paco . Toyota , a company listed in the Forbes Global 2000 , also has its regional office along UN Avenue . + + = = = Tourism = = = + + Tourism is a vital industry in Manila , and it welcomes over 1 million tourists each year . Major destinations include the walled city of Intramuros , the National Theater at the Cultural Center of the Philippines , Manila Ocean Park , Binondo , Ermita , Malate , Manila Zoo , National Museum of the Philippines and Rizal Park . 
+ Rizal Park , also known as Luneta Park , is the national park of the country and has an area of 58 hectares ( 140 acres ) , making it the largest urban park in Asia . In the Tourism Act of 2009 , Rizal Park along with Intramuros are designated as flagship destination to become a tourism enterprise zone . A new attraction called Paseo de Manila is expected to rise in the park . The park was constructed as an honor and dedication to the country 's national hero José Rizal , who was executed by the Spaniards on charges of subversion . The flagpole west of the Rizal Monument is the Kilometer Zero marker for distances to the rest of the country . + Intramuros is the historic center of Manila . Originally , it was considered to be Manila itself at the time when the Philippines was under the Spanish Empire colonial rule . Owing to its history and cultural value , Intramuros and Rizal Park are designated as flagship destination to become a tourism enterprise zone in the Tourism Act of 2009 . Intramuros is managed by the Intramuros Administration ( IA ) . + The architecture of Intramuros reflects the Spanish colonial style and the American neoclassical architectural style , since the Philippines was a colony of Spain and the United States before it is granted its independence in 1946 . Kalesa is a popular mode of transportation in Intramuros and nearby places such as Binondo , Ermita and the Rizal Park . + Popular tourist destinations in Intramuros include the Baluarte de San Diego , Club Intramuros Golf Course , Cuartel de Santa Lucia , Fort Santiago , Manila Cathedral , Palacio Arzobispal , Palacio de Santa Potenciana , Palacio del Gobernador , Plaza Mexico , Plaza de Roma , San Agustin Church and the Ayuntamiento de Manila . + Some of the country 's oldest schools are founded in Intramuros , these are the University of Santo Tomas ( 1611 ) , Colegio de San Juan de Letran ( 1620 ) , and Ateneo de Manila University ( 1859 ) . 
Only Colegio de San Juan de Letran remains at Intramuros ; the University of Santo Tomas transferred to a new campus at Sampaloc in 1927 , and Ateneo left Intramuros for Loyola Heights , Quezon City ( while still retaining " de Manila " in its name ) in 1952 . Other prominent educational institutions include the Manila High School and the University of the City of Manila . + The Department of Tourism designates Manila as the pioneer of medical tourism , expecting it to generate $ 1 billion in revenue annually . However , the lack of a progressive health system , inadequate infrastructure and the unstable political environment are seen as hindrances to its growth . + + = = = Shopping centers = = = + + Manila is a well @-@ known shopping hub of the country and it has been named as one of the best shopping destinations in Asia . Major shopping malls , markets and bazaars thrive in Manila . + Robinsons Place Manila is the largest shopping mall in the city . The mall was the second and , by far , the largest Robinson Mall ever built by John Gokongwei . SM Supermall maintains a presence in the city . One of their shopping malls is the SM City Manila , the first SM Supermall in the city featuring major SM brands like The SM Store , SM Supermarket , SM Cinemas and SM Foodcourt . It is located right beside the Manila City Hall . SM City San Lazaro is the second SM Supermall in Manila . It is located in Santa Cruz . SM City San Lazaro was constructed on the site of the former San Lazaro Hippodrome . The building of the former Manila Royal Hotel in Quiapo which is famed for its revolving restaurant atop is now the SM Clearance Center which was established in 1972 . The site of the first SM Store is located at Carlos Palanca Sr. ( formerly Echague ) Street in San Miguel . + Quiapo is referred to as the " Old Downtown " where tiangges , markets , boutique shops , music and electronics stores are common . C.M. Recto Avenue is where lots of department stores are located . 
One of Recto Avenue 's famous destinations is Divisoria , home to numerous shopping malls in the city . It is also dubbed as the shopping mecca of the Philippines where everything is sold at bargain prices . Binondo , the oldest Chinatown in the world , is the city 's center of commerce and trade for all types of businesses run by Filipino @-@ Chinese merchants with a wide variety of Chinese and Filipino shops and restaurants . + + = = Arts , culture and religion = = + + + = = = Religion = = = + + + = = = = Christianity = = = = + + As a result of Spanish cultural influence , Manila is a predominantly Christian ( Catholic ) city . As of 2010 , Roman Catholics comprise 83 @.@ 5 % of the population , followed by adherents of the Philippine Independent Church ( 2 @.@ 4 % ) ; Iglesia ni Cristo ( 1 @.@ 9 % ) ; various Protestant churches ( 1 @.@ 8 % ) ; and Buddhists ( 1 @.@ 1 % ) . Members of Islam and other religions comprise the remaining 10 @.@ 4 % of the city 's population . + Manila is the site of prominent Catholic churches and institutions . The Manila Cathedral is the seat of the Roman Catholic Archdiocese of Manila and the oldest established church in the country . Aside from the Manila Cathedral , there are also three other basilicas in the city : Quiapo Church , Binondo Church , and the Minor Basilica of San Sebastián . The San Agustín Church in Intramuros is a UNESCO World Heritage Site and is one of the two fully air @-@ conditioned Catholic churches in the city . Manila also has other parishes located throughout the city , with some of them dating back to the Spanish Colonial Period when the city served as the base for numerous Catholic missions both within the Philippines and to Asia beyond . + Several Mainline Protestant denominations are headquartered in the city . St. Stephen 's Parish pro @-@ cathedral in the Sta . 
Cruz district is the see of the Episcopal Church in the Philippines ' Diocese of Central Philippines , while along Taft Avenue are the main cathedral and central offices of the Iglesia Filipina Independiente ( also called the Aglipayan Church , a national church that was a product of the Philippine Revolution ) . Other faiths like The Church of Jesus Christ of Latter @-@ day Saints maintain a presence in the city . + The indigenous Iglesia ni Cristo has several locales ( akin to parishes ) in the city , including its very first chapel ( now a museum ) in Punta , Sta . Ana . Evangelical , Pentecostal and Seventh @-@ day Adventist denominations also thrive within the city . The headquarters of the Philippine Bible Society is in Manila . Also , the main campus of the Cathedral of Praise is located along Taft Avenue . Jesus Is Lord Church also has several branches and campuses in Manila , and celebrates its anniversary yearly at the Burnham Green and Quirino Grandstand in Rizal Park . + + = = = = Other faiths = = = = + + The city also hosts other religions . There are many Buddhist and Taoist temples serving the Chinese Filipino community . Quiapo is home to a sizable Muslim population which worships at Masjid Al @-@ Dahab . Members of the Indian expatriate population have the option of worshiping at the large Hindu temple in the city , or at the Sikh gurdwara along United Nations Avenue . The National Spiritual Assembly of the Bahá 'ís of the Philippines , the governing body of the Filipino Bahá 'í community , is headquartered near Manila 's eastern border with Makati . + + = = = Annual cultural events and religious festivities = = = + + Manila celebrates civic and national holidays . Manila Day , which celebrates the city 's founding on June 24 , 1571 , was first proclaimed by Herminio A. Astorga ( then Vice Mayor of Manila ) on June 24 , 1962 and has been annually commemorated , under the patronage of John the Baptist . 
Locally , each of the city 's barangays also have their own festivities guided by their own patron saint . The city is also the host to the Feast of the Black Nazarene , held every January 9 , which draws millions of Catholic devotees . Another religious feasts held in Manila was the Feast of the Nuestra Señora de los Desamparados de Manila ( Our Lady of the Abandoned ) , the patron saint of Santa Ana and was held every May 12 . Non @-@ religious holidays include the New Year 's Day , National Heroes ' Day , Bonifacio Day and Rizal Day . + + = = = Museums and art galleries = = = + + As the cultural center of the Philippines , Manila is the home to a number of museums . The National Museum of the Philippines Complex , which include the National Museum of Fine Arts , Museum of Anthropology and the Museum of Natural History , is located on the northeast part of Rizal Park facing Taft Avenue. proposed national government center during the American time . Museums established by educational institutions include the Mabini Shrine , the Museum of Contemporary Art and Design , UST Museum of Arts and Sciences , and the UP Museum of a History of Ideas . + Bahay Tsinoy , one of Manila 's most prominent museums , documents the Chinese lives and contributions in the history of the Philippines . The Intramuros Light and Sound Museum chronicles the Filipinos desire for freedom during the revolution under Rizal 's leadership and other revolutionary leaders . The Metropolitan Museum of Manila exhibits the Filipino arts and culture . + Other museums in the city are the Museum of Manila , the city @-@ owned museum that exhibits the city 's culture and history , Museo Pambata , a children 's museum , the Museum of Philippine Political History , which exhibits notable political events in the country , the Parish of the Our Lady of the Abandoned and the San Agustin Church Museum , which houses religious artifacts , and Plaza San Luis , a public museum . 
+ + = = Sports = = + + Sports in Manila have a long and distinguished history . The city 's , and in general the country 's main sport is basketball , and most barangays have a makeshift basketball court , with court markings drawn on the streets . Larger barangays have covered courts where interbarangay leagues are held every summer ( April to May ) . + The city has several well @-@ known sports venues , such as the Rizal Memorial Sports Complex and San Andres Gym , the home of the now defunct Manila Metrostars . The Rizal Memorial Sports Complex houses the Rizal Memorial Track and Football Stadium , the Baseball Stadium , Tennis Courts , Memorial Coliseum and the Ninoy Aquino Stadium ( the latter two are indoor arenas ) . + The Rizal complex had hosted several multi @-@ sport events , such as the 1954 Asian Games and the 1934 Far Eastern Games . Whenever the country hosts the Southeast Asian Games , most of the events are held at the complex , but in the 2005 Games , most events were held elsewhere . The 1960 ABC Championship and the 1973 ABC Championship , forerunners of the FIBA Asia Championship , was hosted by the complex , with the national basketball team winning on both tournaments . The 1978 FIBA World Championship was held at the complex although the latter stages were held in the Araneta Coliseum in Quezon City , Southeast Asia 's largest indoor arena at that time . + Manila also hosts several well @-@ known sports facilities such as the Enrique M. Razon Sports Center and the University of Santo Tomas Sports Complex , both of which are private venues owned by a university ; collegiate sports are also held , with the University Athletic Association of the Philippines and the National Collegiate Athletic Association basketball games held at Rizal Memorial Coliseum and Ninoy Aquino Stadium , although basketball events had transferred to San Juan 's Filoil Flying V Arena and the Araneta Coliseum in Quezon City . 
Other collegiate sports are still held at the Rizal Memorial Sports Complex . Professional basketball also used to play at the city , but the Philippine Basketball Association now holds their games at Araneta Coliseum and Cuneta Astrodome at Pasay ; the now defunct Philippine Basketball League played some of their games at the Rizal Memorial Sports Complex . + The Manila Storm are the city 's rugby league team training at Rizal Park ( Luneta Park ) and playing their matches at Southern Plains Field , Calamba , Laguna . + Previously a widely played sport in the city , Manila is now the home of the only sizable baseball stadium in the country , at the Rizal Memorial Baseball Stadium . The stadium hosts games of Baseball Philippines ; Lou Gehrig and Babe Ruth were the first players to score a home run at the stadium at their tour of the country on December 2 , 1934 . + Another popular sport in the city are cue sports , and billiard halls are a feature in most barangays . The 2010 World Cup of Pool was held at Robinsons Place Manila . + The Rizal Memorial Track and Football Stadium hosted the first FIFA World Cup qualifier in decades when the Philippines hosted Sri Lanka in July 2011 . The stadium , which was previously unfit for international matches , had undergone a major renovation program prior to the match . The Football Stadium now regularly hosts matches of the United Football League . The stadium also hosted its first rugby test when it hosted the 2012 Asian Five Nations Division I tournaments . + + = = Government and politics = = + + The government of Manila is divided into three branches : executive , legislative and judiciary . The judicial branch is administered solely by the Supreme Court of the Philippines under the Metro Manila judicial region . The city government have control of the executive and legislative branch . Manila employs 11 @,@ 919 personnel at the end of 2014 . 
+ The current Mayor of Manila is Joseph Estrada , who served as the President of the Philippines from 1998 @-@ 2001 . He is also the head of the executive department of the city . The legislative arm which is composed of six elected city councilors , is headed by the Vice Mayor . Former actor Isko Moreno currently serves as the city 's vice mayor . Altogether they are assisted by the Manila City Council , the local President of the Association of Barangay Captains , and the President of the Sangguniang Kabataan . Their offices are located at the Manila City Hall . + + = = = Finance = = = + + On September 25 , 2014 , the Commission on Audit released its 2013 Annual Financial Report citing the city 's income at ₱ 10 @.@ 1 billion with an asset worth of ₱ 18 @.@ 6 billion . Its local income stood at ₱ 5 @.@ 41 billion and its national government allocation was ₱ 1 @.@ 74 billion , having an annual regular income ( ARI ) of an estimated ₱ 7 @.@ 15 billion . Manila 's net income stood at ₱ 3 @.@ 54 billion in 2014 . + Among the local government units , Manila has the highest budget allocation to health . It was also one of the cities with the highest tax and internal revenue . Tax revenue accounts for 46 % of the city 's income in 2012 . + + = = = Districts and barangays = = = + + Manila has six legislative districts that serve as the constituencies for the election of the city 's representatives to the lower house of the Congress of the Philippines and of the regular members to the Sangguniang Panlungsod ( SP ; City Council ) . Each district elects one representative to the House of Representatives and six SP members to the council . The city , along with the rest of the nation , elects 12 senators as one at @-@ large district . + Manila is politically divided into 896 barangays , the smallest unit of local government in the Philippines . Each barangay has its own chairperson and councilors . 
For administrative convenience , all the barangays in Manila are grouped into 100 zones . These zones have no form of local government . + The 1st District ( 2015 population : 415 @,@ 906 ) is Manila 's ( and the country 's ) most densely populated congressional district . It covers the western portion of Tondo that lies along Manila Bay . + The 2nd District ( 2015 population : 215 @,@ 457 ) covers the eastern inland portion of Tondo , a neighborhood or sub @-@ district known as Gagalangin . + The 3rd District ( 2015 population : 197 @,@ 242 ) covers Binondo , Quiapo , San Nicolas and Santa Cruz . + The 4th District ( 2015 population : 265 @,@ 046 ) covers Sampaloc . + The 5th District ( 2015 population : 366 @,@ 714 ) covers Ermita , Malate , Port Area , Intramuros , San Andres Bukid , and a portion of Paco ( except Zone 90 ) . + The 6th District ( 2007 population : 295 @,@ 245 ) covers Paco ( Zone 90 only ) , Pandacan , San Miguel , Santa Ana and Santa Mesa . + Manila has the most number of barangays of any city or municipality in the Philippines . Attempts at reducing its number have not prospered despite local legislation — Ordinance 7907 , passed on 23 April 1996 — reducing the number from 897 to 150 by merging existing barangays , because of the failure to hold a plebiscite . + + = = = National government = = = + + Manila , being the seat of political power of the Philippines , has several national government offices headquartered at the city . Planning for the development for being the center of government started during the early years of American colonization to the country when they envisioned a well @-@ designed city outside the walls of Intramuros . The strategic location chosen was Bagumbayan , a former town which is now the Rizal Park to become the center of government and a design commission was given to Daniel Burnham to create a master plan for the city patterned after Washington D.C .. 
These improvements were eventually abandoned under the Commonwealth Government of Manuel L. Quezon . + A new government center was to be built on the hills northeast of Manila , or what is now Quezon City . Several government agencies have set up their headquarters in Quezon City but several key government offices still reside in Manila . However , many of the plans were substantially altered after the devastation of Manila during World War II and by subsequent administrations . + The city , as the capital , still hosts the Office of the President , as well as the president 's official residence . Aside from these , important institutions such as the Supreme Court , the Court of Appeals , the Bangko Sentral ng Pilipinas , the Departments of Budget and Management , Finance , Health , Justice , Labor and Employment and Public Works and Highways still call the city home . Manila also hosts important national institutions such as the National Library , National Archives , National Museum and the Philippine General Hospital . + Congress previously held office at the Old Congress Building . In 1972 , due to declaration of martial law , Congress was dissolved ; its successor , the unicameral Batasang Pambansa , held office at the new Batasang Pambansa Complex . When a new constitution restored the bicameral Congress , the House of Representatives stayed at the Batasang Pambansa Complex , while the Senate remained at the Old Congress Building . In May 1997 , the Senate transferred to a new building it shares with the Government Service Insurance System at reclaimed land at Pasay . + + = = Infrastructure = = + + + = = = Utilities = = = + + + = = = = Water and electricity = = = = + + Water services used to be provided by the Metropolitan Waterworks and Sewerage System , which served 30 % of the city with most other sewage being directly dumped into storm drains , septic tanks , or open canals . 
MWSS was privatized in 1997 which split the water concession into the east and west zones . The Maynilad Water Services took over the west zone of which Manila is a part . It now provides the supply and delivery of potable water and sewerage system in Manila , but it does not provide service to the southeastern part of the city which belongs to the east zone that is served by Manila Water . Electric services are provided by Meralco , the sole electric power distributor in Metro Manila . + + = = = Transportation = = = + + One of the more famous modes of transportation in Manila is the jeepney . Patterned after U.S. army jeeps , these have been in use since the years immediately following World War II . The Tamaraw FX , the third generation Toyota Kijang , which competed directly with jeepneys and followed fixed routes for a set price , once plied the streets of Manila . + On a for @-@ hire basis , the city is served by numerous taxicabs , " tricycles " ( motorcycles with sidecars , the Philippine version of the auto rickshaw ) , and " trisikads " or " sikads " ( bicycles with a sidecars , the Philippine version of pedicabs ) . In some areas , especially in Divisoria , motorized pedicabs are popular . Spanish @-@ era horse @-@ drawn calesas are still a popular tourist attraction and mode of transportation in the streets of Binondo and Intramuros . All types of public road transport are privately owned and operated under government franchise . + The city is serviced by the LRT @-@ 1 and LRT @-@ 2 which forms the LRTA system , as distinct from the MRT @-@ 3 which is under the MRTC system that services other parts of Metro Manila . Development of the railway system began in the 1970s under the Marcos administration , making it the first light rail transport in Southeast Asia . These systems are currently undergoing a multibillion @-@ dollar expansion . 
LRT Line 1 runs along the length of Taft Avenue ( R @-@ 2 ) and Rizal Avenue ( R @-@ 9 ) , and the LRT Line 2 runs along Claro M. Recto Avenue ( C @-@ 1 ) and Ramon Magsaysay Boulevard ( R @-@ 6 ) from Santa Cruz , through Quezon City , up to Santolan in Marikina . + The main terminal of the Philippine National Railways lies within the city . One commuter railway within Metro Manila is in operation . The line runs in a general north @-@ south direction from Tutuban ( Tondo ) toward Laguna . The Port of Manila , located in the vicinity of Manila Bay is the chief seaport of the Philippines . The Pasig River Ferry Service which runs on the Pasig River is another form of transportation . The city is also served by the Ninoy Aquino International Airport and Clark International Airport . + In 2006 , Forbes magazine ranked Manila " the world 's most congested city " . Manila has become notorious for its frequent traffic jams and high densities . The government has undertaken several projects to alleviate the traffic in the city . Some of the projects include : the construction of a new flyover at Sampaloc , the construction of the Metro Manila Skyway Stage 3 , the proposed LRT Line 2 ( west ) extension from Recto to Tondo or the Port Area , and the expansion of several national and local roads . However , such projects have yet to make any meaningful impact , and the traffic jams and congestion continue unabated . The urban planning of the Manila and the whole metropolis was based on the Metro Manila Dream Plan , which seeks to address the problems of Metro Manila 's urban planning and transportation . It consists of a list of short term priority projects and medium to long term infrastructure projects that will last up to 2030 . + + = = Healthcare = = + + The Manila Health Department is responsible for the planning and implementation of the health care programs provided by the city government . 
It operates 59 health centers and six city @-@ run hospitals , which are free of charge . The six public city @-@ run hospitals are the Ospital ng Maynila Medical Center , Ospital ng Sampaloc , Gat Andres Bonifacio Memorial Medical Center , Ospital ng Tondo , Sta . Ana Hospital , and Justice Jose Abad Santos General Hospital . Manila is also the site of the Philippine General Hospital , the tertiary state @-@ owned hospital administered and operated by the University of the Philippines Manila . + Manila 's healthcare is also provided by private corporations . Private hospitals that operates in the city are the Manila Doctors Hospital , Chinese General Hospital and Medical Center , Dr. José R. Reyes Memorial Medical Center , Metropolitan Medical Center , Our Lady of Lourdes Hospital , and the University of Santo Tomas Hospital . + The Department of Health has its main office in Manila . The national health department also operates the San Lazaro Hospital , a special referral tertiary hospital . Manila is also the home to the headquarters of the World Health Organization Regional Office for the Western Pacific and the World Health Organization Country Office for the Philippines . + + = = Education = = + + The center of education since the colonial period , Manila — particularly Intramuros — is home to several Philippine universities and colleges as well as its oldest ones . It served as the home of the University of Santo Tomas ( 1611 ) , Colegio de San Juan de Letran ( 1620 ) , Ateneo de Manila University ( 1859 ) , Lyceum of the Philippines University and the Mapua Institute of Technology . Only Colegio de San Juan de Letran ( 1620 ) remains at Intramuros ; the University of Santo Tomas transferred to a new campus at Sampaloc in 1927 , and Ateneo left Intramuros for Loyola Heights , Quezon City ( while still retaining " de Manila " in its name ) in 1952 . 
+ The University of the City of Manila ( Pamantasan ng Lungsod ng Maynila ) located at Intramuros , and Universidad de Manila located just outside the walled city , are both owned and operated by the Manila city government . The national government controls the University of the Philippines Manila , the oldest of the University of the Philippines constituent universities and the center of health sciences education in the country . The city is also the site of the Polytechnic University of the Philippines , the largest university in the country in terms of student population . + The University Belt refers to the area where there is a high concentration or a cluster of colleges and universities in the city and it is commonly understood as the one where the San Miguel , Quiapo and Sampaloc districts meet . Generally , it includes the western end of España Boulevard , Nicanor Reyes St. ( formerly Morayta St. ) , the eastern end of Claro M. Recto Avenue ( formerly Azcarraga ) , Legarda Avenue , Mendiola Street , and the different side streets . Each of the colleges and universities found here are at a short walking distance of each other . Another cluster of colleges lies along the southern bank of the Pasig River , mostly at the Intramuros and Ermita districts , and still a smaller cluster is found at the southernmost part of Malate near the border with Pasay such as the private co @-@ educational institution of De La Salle University , the largest of all De La Salle University System of schools . + The Division of the City Schools of Manila , a branch of the Department of Education , refers to the city 's three @-@ tier public education system . It governs the 71 public elementary schools , 32 public high schools . 
+ The city also contains the Manila Science High School , the pilot science high school of the Philippines ; the National Museum , where the Spoliarium of Juan Luna is housed ; the Metropolitan Museum of Manila , a museum of modern and contemporary visual arts ; the Museo Pambata , the Children 's Museum , a place of hands @-@ on discovery and fun learning ; and , the National Library , the repository of the country 's printed and recorded cultural heritage and other literary and information resources . + + = = Global outreach = = + + + = = = Twin towns – Sister cities = = = + + Sister cities of Manila + + = = = = Asia / Pacific Rim = = = = + + + = = = = North America = = = = + + + + = Attalea ( palm ) = + + Attalea is a large genus of palms native to Mexico , the Caribbean , Central and South America . This pinnately leaved , non @-@ spiny genus includes both small palms lacking an aboveground stem and large trees . The genus has a complicated taxonomic history , and has often been split into four or five genera based on differences in the male flowers . Since the genera can only be distinguished on the basis of their male flowers , the existence of intermediate flower types and the existence of hybrids between different genera has been used as an argument for keeping them all in the same genus . This has been supported by a recent molecular phylogeny . + Somewhere between 29 and 67 species are recognised in the genus , with estimates of as many as 100 . Incomplete herbarium collections make it difficult to determine whether certain groups represent single species , or groups of similar species . Attalea species have a long history of human use , and include economically important sources of palm oil and fibre . Many species are fire tolerant and thrive in disturbed habitats . Their seeds are animal dispersed , including some which are thought to have been adapted for dispersal by now @-@ extinct Pleistocene megafauna . 
+ + = = Description = = + + The genus Attalea has pinnately compound leaves — rows of leaflets emerge on either side of the axis of the leaf in a feather @-@ like or fern @-@ like pattern . Species are also non @-@ spiny palms and include both large trees with stout stems up to 30 metres ( 98 ft ) tall and acaulescent palms ( ones which lack an aboveground stem ) . The number of leaves per individual varies from about three to thirty @-@ five ; larger plants tend to have more and longer leaves . + Inflorescences are large , branched and borne among the leaves . The inflorescence consists of a main axis — the peduncle and the rachis — and a series of smaller branches , the rachillae . The rachillae , which bear the flowers , emerge from the rachis . The peduncle is the main stalk , connecting the rachis with the stem . Inflorescences either consist entirely of male flowers , or are predominantly female with a few male flowers . Fruit usually have two or three seeds , although fewer or more are present in some species , and are usually brown , yellow , orange @-@ brown or purple when mature . + Four different types of male flowers exist . On the basis of these flower types , the genus has often been split into four genera — a more narrowly defined Attalea , Orbignya , Maximiliana and Scheelea . The species sometimes referred to Orbignya have coiled anthers , while the other groups have straight ones . The petals of those placed in Maximiliana are much shorter than the stamens , while those placed in Scheelea and a more narrowly defined Attalea have petals that are longer than the stamens . Five species do not fit easily into any of these groups ; this fact has been used as an argument in favour of considering this group a single genus . 
+ + = = Taxonomy = = + + Attalea has been placed in the subfamily Arecoideae , the tribe Cocoseae and the subtribe Attaleinae , together with the genera Allagoptera , Beccariophoenix , Butia , Cocos , Jubaea , Jubaeopsis , Lytocaryum , Parajubaea , Syagrus and Voanioala . Within this subtribe , Attalea has been found to be a monophyletic group , and sister to the clade containing Allagoptera , Polyandrococos , Parajubaea , Butia and Jubaea . + Disagreement exists as to whether Attalea should be considered a single genus , or a group of related genera . In their 1996 Field Guide to the Palms of the Americas , Andrew Henderson , Gloria Galeano and Rodrigo Bernal combined all the species in the subtribe Attaleinae ( as it was then defined ) into a single genus , Attalea . In his 1999 Taxonomic Treatment of Palm Subtribe Attaleinae , American botanist Sidney F. Glassman divided the group into five genera — a more narrowly defined Attalea , Orbignya , Maximiliana , Scheelea and Ynesa . Rafäel Govaerts and John Dransfield recognised a single genus in their 2005 World Checklist of Palms , and Jean @-@ Christophe Pintaud continued this usage in his 2008 review of the genus . + The multi @-@ genus approach is based solely on the structure of the male flowers ; no other characters could be consistently associated with one genus or another . Four of the genera — Attalea ( in a narrow sense ) , Orbignya , Maximiliana and Scheelea — correspond to four different types of male flowers found within the genus . However , a few species have flowers that are intermediate between these four types , including A. colenda ( which Glassman placed in its own genus , Ynesa ) and this has been used as an argument for the single @-@ genus approach . In addition , there are several hybrids between species that would be considered different genera under Glassman 's five @-@ genus system , which has also been used as an argument for placing them in a single genus . 
In 2009 Alan Meerow and colleagues published a molecular phylogeny of the subtribe which found that some species placed in Orbignya were actually more closely related to species placed in Scheelea than they were to other members of that genus ( if the five @-@ genus approach was used ) , while A. crassispatha , placed in Orbignya by Glassman , was actually a sister to both Scheelea and Orbignya . + + = = = History = = = + + The genus Attalea was first described by Carl Sigismund Kunth in 1816 based on specimens collected by Alexander von Humboldt and Aimé Bonpland , although older , pre @-@ Linnaean descriptions exist , including Charles Plumier 's 1703 description of A. crassispatha . The genus was named for Attalus III Philometor , king of Pergamon , known for his interest in medicinal plants . The type species is A. amygdalina , a Colombian endemic . The genera Maximiliana and Orbignya were described by Carl Friedrich Philipp von Martius in 1826 and 1837 respectively . Scheelea was described by Hermann Karsten in 1857 , and Ynesa by Orator F. Cook in 1942 . + + = = = Species = = = + + Experts disagree about the number of species in the genus Attalea ( broadly defined ) . In 1965 , Dutch taxonomist Jan Gerard Wessels Boer estimated that there may be as many as 100 species in the genus . In their 1996 Field Guide to the Palms of the Americas Andrew Henderson and coauthors recognised 29 species in the genus , while Sidney Glassman recognised 65 species in his 1999 treatment of the group . Largely following Glassman 's lead , Rafaël Govaerts and John Dransfield recognised 67 species in their 2005 World Checklist of Palms . An important element of this disagreement is the decision by Glassman to define species more narrowly than Henderson . As a result , what Henderson interpreted as variation within species , Glassman took as differences between morphologically similar species . 
This problem is complicated by the fact that many of these species are poorly represented in herbarium collections . The large size of the leaves , inflorescences and fruit of many Attalea species makes them difficult to collect . In addition , many important collections , including type specimen , have been lost or destroyed . Sparse or incomplete collections make it difficult to differentiate variation within a single species from variation between different species . + The three recent treatments ( Henderson and coauthors , Glassman , and Govaerts and Dransfield ) recognised a total of 73 species , but only 20 species are accepted by all of them . The remainder account for either nine species or more than 40 . For example , what Andrew Henderson considered a single species , Attalea attaleoides , other authors have considered a species complex consisting of four or five species . Glassman doubted the validity of A. attaleoides as a species , and described four new species from material that had previously been attributed to A. attaleoides — A. camopiensis , A. degranvillei , A. guianensis and A. maripensis . Govaerts and Dransfield accepted both Glassman 's four species and A. attaleoides . However , Jean @-@ Christophe Pintaud was of the opinion that A. guianensis , A. maripensis and A. attaleoides were all very similar , and thought it likely that they all represented the same species . + Another species complex in Attalea includes A. speciosa and related species . Henderson ( 1995 ) recognised A. speciosa and A. spectabilis , considering the latter to either be an acaulescent form of A. speciosa or a hybrid between it and A. microcarpa . Govaerts and Dransfield accepted A. spectabilis , but Glassman considered it a dubious taxon . Attalea vitrivir was recognised as a distinct species by Michael Balick and coauthors ; Glassman and Govaerts and Dransfield concurred , but Henderson considered it part of A. speciosa . 
Glassman also described a fourth member of this group , A. brejinhoensis , and it is accepted by Govaerts and Dransfield . + + = = Reproduction and growth = = + + Attalea species are monoecious — male and female flowers are separate , but are borne by the same plant . Various species have been described as being insect @-@ pollinated , including A. phalerata , while pollination in A. colenda and A. speciosa has been attributed both to insects and wind . + Seed germination is remote tubular — during germination , as the cotyledon expands it pushes the young shoot away from the seed . After germination , the stem initially grows downward before turning to grow upward and produce the aboveground stem . This produces a " saxophone shaped " belowground portion of the stem . The fact that the shoot tips of Attalea seedlings are underground is likely to contribute to their fire @-@ tolerance . + + = = Distribution = = + + Species range across the Neotropics from Mexico in the north to Bolivia , Paraguay , and southern Brazil in the south . According to Govaerts and coauthors , three species are found in Mexico , four in Central America , and 62 in South America . Three species are present in the Caribbean — two in Trinidad and Tobago , along the southern edge of the region , and one in Haiti . + + = = Habitat and ecology = = + + Attalea includes both large trees and small acaulescent palms which occupy a number of different ecological niches . Dense stands of some of the larger species are conspicuous elements on the landscape , while smaller species are found both in the forest understorey and in savannas . + Disturbance has been implicated in the formation of vegetation dominated by large Attalea species . In seasonally dry Amazonian forests the density of large adult A. maripa palms was correlated with canopy openness ; the species also dominates savannas formed by repeated forest fires in Trinidad and Tobago . 
Attalea speciosa forms pure stands in many parts of Brazil where natural forest vegetation has been cleared . Similarly , stands of A. funifera in Bahia , Brazil ( which are cultivated for piassava fibre ) are managed using fire — the seedlings survive cutting and burning , and are able to dominate burned forest patches . + The fruit are dispersed by animals ; fruit which are not dispersed frequently suffer seed predation by bruchid beetles . Certain species of Attalea have been mentioned as examples of " anachronistic " species which are adapted for dispersal by now @-@ extinct Pleistocene megafauna . On Maracá Island , Roraima , in the Brazilian Amazon , Attalea maripa fruit were consumed by tapirs , collared peccaries , deer and primates . Rodents , including agoutis , fed upon the fruit and , as the fruit availability declined , they fed on the seeds . Other dispersers of Attalea fruit include Crested Caracaras which consume the fruit and disperse the seeds of A. phalerata in the Brazilian Pantanal . + + = = Uses = = + + Attalea species have a long history of human utilisation . Carbonised Attalea maripa seeds have been found in archaeological sites in Colombia dating back to 9000 BP . A variety of species remain important sources of edible oil , thatch , edible seeds and fibre . The leaves of Attalea butyracea and A. maripa are used extensively for thatching . Several species are oil palms , with A. speciosa among the most important economically . Products extracted from A. speciosa were reported to support over 300 @,@ 000 households in the Brazilian state of Maranhão in 2005 , and in 1985 it was estimated to support over 450 @,@ 000 households throughout Brazil . Piassava fibres , extracted from the leaf bases of A. funifera , are commercially important , and generated about US $ 20 million in annual income to Brazilian farmers in 1996 . 
+ + + = The Heart of Ezra Greer = + + The Heart of Ezra Greer is a 1917 American silent drama film produced by the Thanhouser Company and directed by Emile Chautard . The film focuses on Ezra Greer , a successful middle @-@ aged man who searches for his college age daughter , Mary . The wayward Mary was romanced and abandoned by Jack Denbeigh , later bearing his child . Once Ezra becomes broke he finds employment as the valet for Jack Denbeigh . After Jack 's engagement to a cabaret girl , Mary becomes upset and leaves her child at Jack 's home . Contrary to Jack 's wishes , Ezra keeps the child and Jack ultimately reveals that the child is his own . Ezra convinces Jack to make things right and Ezra convinces the cabaret girl to leave Jack . After a carriage accident in which the baby is injured , Ezra and Jack rush to the hospital and find Mary as a nurse crying over the child . The film ends with the marriage of Jack and Mary . The film was released by Pathé on October 7 , 1917 . The film was the final release from Thanhouser and was deemed to be an average film by most reviewers . Criticism for the film hinged on far @-@ fetched coincidences to drive the plot . The film is presumed lost . + + = = Plot = = + + The film follows Ezra Greer , a middle @-@ aged man who has worked hard since his youth . He cares deeply for his motherless daughter , Mary , but was unable to attend the annual commencement at her co @-@ educational college . He waits for her to return from college , but Mary leaves with her romantic interest , Jack Denbeigh . On promise of marriage and wealth , Mary is romanced and gives birth to a fatherless child . Without word from his daughter , Ezra resigns from his job and attempts to seek her out and finds a poor motherless child , Marie . With Ezra 's money exhausted he seeks employment and finds it as the valet of Jack . + One day , Mary sees an announcement of Jack 's engagement to a cabaret girl known as " The Baby Vamp " . 
Bitter over the prospect of her child 's future , she leaves the child at Jack 's home during his absence with a note . Jack orders Ezra to take the baby to an orphanage , but Marie begs Ezra to keep him . After continually seeing the child , Jack is overcome with remorse and explains to Ezra and seeks his advice . Not knowing he was making the case for his own daughter , Ezra convinces Jack to seek out Mary and forget the Baby Vamp . The Baby Vamp seeks out Jack , but finds Ezra who convinces her to leave Jack . Jack 's son is later injured in a coach accident and is taken to the hospital . Jack and Ezra rush to the hospital and find Mary , as a nurse , crying over the injured child . Ezra is enraged upon learning that his own daughter was mistreated by Jack , but Mary steps between the two men . Jack apologizes and wants to make it right . The film concludes with the marriage of Jack and Mary . + + = = Cast = = + + Frederick Warde as Ezra Greer + Leila Frost as Mary + George Forth as Jack Denbeigh + Thomas A. Curran as Denbeigh 's guardian + Lillian Mueller as Amy Devers + Carey L. Hastings as Denbeigh 's housekeeper + Helen Badgley as the poor little girl + Gerald Badgley as the millionaire 's baby + W. Ray Johnston + + = = Production = = + + The film was the final production and release of the Thanhouser Company and it was to be released through Pathé . Numerous factors would play into the winding down and eventual closing of the Thanhouser Film Corporation with much advance notice by Edwin Thanhouser . Q. David Bowers writes that it was easy to understand Thanhouser 's decision to retire due to numerous aspects including that releases through Pathé were based on their decision to release or discard the work , the New Rochelle studio was 2 @,@ 500 miles from the center of the trade activity and the slump in industry tied to World War I. 
Weeks before the film was released , Variety told of the winding down of Thanhouser , with the studio 's staff consisting of Edwin Thanhouser and the bookkeeper , Jessie B. Bishop . The article concluded with the announcement that Lloyd F. Lonergan , the scenario writer of the company , had retired from the company . As it wound down , the Thanhouser Company was announced to have no liabilities and would close with a positive bank balance . Little is known of the production of this final film , but it was directed by Emile Chautard from a scenario written by Lloyd F. Lonergan . The cameraman was Jacques Bizeul . + + = = Release and reception = = + + The five reel film was released through the Pathé Exchange as a Pathé Gold Rooster Play on October 7 , 1917 . Charles E. Wagner of the Exhibitor 's Trade Review found it to be a good film with great direction and photography , but was concerned that the stunt in which the baby appeared to be involved in the accident was too real . Wagner stated the film had sufficient action and pathos without sexual suggestiveness ; which should prove a strong program for the Pathé program . Frances Agnew of The Morning Telegraph found it to be an average picture that was not exceptional for audiences , but it would hold sentimental appeal for the average viewer . A reviewer for The New York Dramatic Mirror found the film 's excessive use of coincidental meetings to be highly improbable , but found Warde 's performance to be excellent and the rest of the cast to give good performances . The reviewer said that Emile Chautard had made the improbable story more plausible . + Like many American films of the time , The Heart of Ezra Greer was subject to cuts by city and state film censorship boards . The Chicago Board of Censors required the cutting in Reel 2 of a letter stating , " I cannot face my father , " etc . 
, and two closeups of gambling scenes ; and in Reel 5 a change of the intertitle " Because it means her whole future " to " Because she is his wife " . + + + = Free Derry = + + Free Derry ( Irish : Saor Dhoire ) was a self @-@ declared autonomous nationalist area of Derry , Northern Ireland , that existed between 1969 and 1972 . Its name was taken from a sign painted on a gable wall in the Bogside in January 1969 which read , " You are now entering Free Derry " . The area , which included the Bogside and Creggan neighbourhoods , was secured by community activists for the first time on 5 January 1969 following an incursion into the Bogside by members of the Royal Ulster Constabulary ( RUC ) . Residents built barricades and carried clubs and similar arms to prevent the RUC from entering . After six days the residents took down the barricades and RUC patrols resumed , but tensions remained high over the following months . + Violence reached a peak on 12 August 1969 , culminating in the Battle of the Bogside — a three @-@ day pitched battle between residents and the RUC . On 14 August units of the British Army were deployed at the edge of the Bogside and the RUC were withdrawn . The Derry Citizens Defence Association ( DCDA ) declared their intention to hold the area against both the RUC and the British Army until their demands were met . The British Army made no attempt to enter the area . The situation continued until October 1969 when , following publication of the Hunt Report , military police were allowed in . + The Irish Republican Army ( IRA ) began to re @-@ arm and recruit after August 1969 . In December 1969 it split into the Official IRA and the Provisional IRA . Both were supported by the people of the Free Derry area . Meanwhile , relations between the British Army and the nationalist community , which were initially good , deteriorated . In July 1971 there was a surge of recruitment into the IRA after two young men were shot and killed by British troops . 
The government introduced internment on 9 August 1971 , and in response , barricades went up once more in the Bogside and Creggan . This time , Free Derry was a no @-@ go area , defended by armed members of both the Official and Provisional IRA . From within the area they launched attacks on the British Army , and the Provisionals began a bombing campaign in the city centre . As before , unarmed ' auxiliaries ' manned the barricades , and crime was dealt with by a voluntary body known as the Free Derry Police . + Support for the IRA increased further after Bloody Sunday in January 1972 , when thirteen unarmed men and boys were shot dead by the British Army 's Parachute Regiment at a march in the Bogside ( a 14th man was wounded and died 4 ½ months later ) . The support began to wane after the killing by the Official IRA of a local youth who was home on leave from the British Army . After a Provisional IRA ceasefire , during which it entered talks with the British government , broke down , the British took the decision to move against the " no @-@ go " areas . Free Derry came to an end on 31 July 1972 , when thousands of British troops moved in with armoured cars and bulldozers to occupy the area . + + = = Background = = + + Derry City lies near the border between Northern Ireland and the Republic of Ireland . It has a majority nationalist population , and nationalists won a majority of seats in the 1920 local elections . Despite this , the Ulster Unionist Party controlled the local council , Londonderry Corporation , from 1923 onwards . 
The Unionists maintained their majority , firstly , by manipulating the constituency boundaries ( gerrymandering ) so that the South Ward , with a nationalist majority , returned eight councillors while the much smaller North Ward and Waterside Ward , with unionist majorities , returned twelve councillors between them ; secondly , by allowing only ratepayers to vote in local elections , rather than one man , one vote , so that a higher number of nationalists , who did not own homes , were disenfranchised ; and thirdly , by denying houses to nationalists outside the South Ward constituency . The result was that there were about 2 @,@ 000 nationalist families , and practically no unionists , on the housing waiting list , and that housing in the nationalist area was crowded and of a very poor condition . The South Ward comprised the Bogside , Brandywell , Creggan , Bishop Street and Foyle Road , and it was this area that would become Free Derry . + The Derry Housing Action Committee ( DHAC ) was formed in March 1968 by members of the Derry Branch of the Northern Ireland Labour Party and the James Connolly Republican Club , including Eamonn McCann and Eamon Melaugh . It disrupted a meeting of Londonderry Corporation in March 1968 and in May blocked traffic by placing a caravan that was home to a family of four in the middle of the Lecky Road in the Bogside and staging a sit @-@ down protest at the opening of the second deck of the Craigavon Bridge . After the meeting of Londonderry Corporation was again disrupted in August , Eamon Melaugh telephoned the Northern Ireland Civil Rights Association ( NICRA ) and invited them to hold a march in Derry . The date chosen was 5 October 1968 , an ad hoc committee was formed ( although in reality most of the organizing was done by McCann and Melaugh ) and the route was to take the marchers inside the city walls , where nationalists were traditionally not permitted to march . 
The Minister of Home Affairs , William Craig , made an order on 3 October prohibiting the march on the grounds that the Apprentice Boys of Derry were intending to hold a march on the same day . In the words of Martin Melaugh of CAIN " this particular tactic ... provided the excuse needed to ban the march . " When the marchers attempted to defy the ban on 5 October they were stopped by a Royal Ulster Constabulary ( RUC ) cordon . The police drew their batons and struck marchers , including Stormont MP Eddie McAteer and Westminster MP Gerry Fitt . Subsequently the police " broke ranks and used their batons indiscriminately on people in Duke Street " . Marchers trying to escape met another party of police and " these police also used their batons indiscriminately . " Water cannons were also used . The police action caused outrage in the nationalist area of Derry , and at a meeting four days later the Derry Citizens ' Action Committee ( DCAC ) was formed , with John Hume as chairman and Ivan Cooper as vice @-@ chairman . + + = = The first barricades = = + + Another group formed as a result of the events of 5 October was People 's Democracy , a group of students in Queen 's University Belfast . They organised a march from Belfast to Derry in support of civil rights , starting out with about forty young people on 1 January 1969 . The march met with violent opposition from loyalist counter @-@ demonstrators at several points along the route . Finally , at Burntollet Bridge , five miles outside Derry , they were attacked by a mob of about two hundred wielding clubs — some of them studded with nails — and stones . Half of the attackers were later identified from press photographs as members of the B @-@ Specials . The police , who were at the scene , chatted to the B @-@ Specials as they prepared their ambush , and then failed to protect the marchers , many of whom ran into the river and were pelted with stones from the bank . Dozens of marchers were taken to hospital . 
The remainder continued on to Derry where they were attacked once more on their way to Craigavon Bridge before they finally reached Guildhall Square , where they held a rally . Rioting broke out after the rally . Police drove rioters into the Bogside , but did not come after them . In the early hours of the following morning , 5 January , members of the RUC charged into St. Columb 's Wells and Lecky Road in the Bogside , breaking windows and beating residents . In his report on the disturbances , Lord Cameron remarked that " for such conduct among members of a disciplined and well @-@ led force there can be no acceptable justification or excuse " and added that " its effect in rousing passions and inspiring hostility towards the police was regrettably great . " + That afternoon over 1 @,@ 500 Bogside residents built barricades , armed themselves with steel bars , wooden clubs and hurleys , and told the police that they would not be allowed into the area . DCAC chairman John Hume told a meeting of residents that they were to defend the area and no @-@ one was to come in . Groups of men wearing armbands patrolled the streets in shifts . John ' Caker ' Casey , a local activist , painted " You are now entering Free Derry " in white paint on the gable wall of a house on the corner of Lecky Road and Fahan Street . That corner , which was a popular venue for meetings , later became known as " Free Derry Corner " . On 7 January , the barricaded area was extended to include the Creggan , another nationalist area on a hill overlooking the Bogside . A clandestine radio station calling itself " Radio Free Derry " began broadcasting to residents , playing rebel songs and encouraging resistance . On a small number of occasions law @-@ breakers attempted crimes , but were dealt with by the patrols . Despite all this , the Irish Times reported that " the infrastructure of revolutionary control in the area has not been developed beyond the maintenance of patrols . 
" Following some acts of destruction and of violence late in the week , members of the DCAC including Ivan Cooper addressed residents on Friday , 10 January and called on them to dismantle the barricades . The barricades were taken down the following morning . + + = = April 1969 = = + + Over the next three months there were violent clashes , with local youths throwing stones at police . Violence came to a head on Saturday , 19 April after a planned march from Burntollet Bridge to the city centre was banned . A protest in the city centre led to clashes with " Paisleyites " — unionists in sympathy with the anti @-@ civil rights stance of Ian Paisley . Police attempting to drive the protesters back into the Bogside were themselves driven back to their barracks . A series of pitched battles followed , and barricades were built , often under the supervision of Bernadette Devlin , newly elected MP for Mid Ulster . Police pursuing rioters broke into a house in William Street and severely beat the occupant , Samuel Devenny , his family and two friends . Devenny was brought to hospital " bleeding profusely from a number of head wounds . " At midnight four hundred RUC men in full riot gear and carrying riot shields occupied the Bogside . Convoys of police vehicles drove through the area with headlights blazing . + The following day , several thousand residents , led by the DCAC , withdrew to the Creggan and issued an ultimatum to the RUC — withdraw within two hours or be driven out . With fifteen minutes of the two hours remaining , the police marched out through the Butcher 's Gate , even as the residents were entering from the far side . The barricades were not maintained on this occasion , and routine patrols were not prevented . + Samuel Devenny suffered a heart attack four days after his beating . On 17 July he suffered a further heart attack and died . 
Thousands attended his funeral , and the mood was sufficiently angry that it was clear the annual Apprentice Boys ' parade , scheduled for 12 August , could not take place without causing serious disturbance . + + = = August – October 1969 = = + + The Apprentice Boys ' parade is an annual celebration by unionists of the relief of the Siege of Derry in 1689 , which began when thirteen young apprentice boys shut the city 's gates against the army of King James . At that time the parade was held on 12 August each year . Participants from across Northern Ireland and Britain marched along the city walls above the Bogside , and were often openly hostile to the residents . On 30 July 1969 the Derry Citizens Defence Association ( DCDA ) was formed to try to preserve peace during the period of the parade , and to defend the Bogside and Creggan in the event of an attack . The chairman was Seán Keenan , an Irish Republican Army ( IRA ) veteran ; the vice @-@ chairman was Paddy Doherty , a popular local man sometimes known as " Paddy Bogside " and the secretary was Johnnie White , another leading republican and leader of the James Connolly Republican Club . Street committees were formed under the overall command of the DCDA and barricades were built on the night of 11 August . The parade took place as planned on 12 August . As it passed through Waterloo Place , on the edge of the Bogside , hostilities began between supporters and opponents of the parade . Fighting between the two groups continued for two hours , then the police joined in . They charged up William Street against the Bogsiders , followed by the ' Paisleyites ' . They were met with a hail of stones and petrol bombs . The ensuing battle became known as the Battle of the Bogside . Late in the evening , having been driven back repeatedly , the police fired canisters of CS gas into the crowd . Youths on the roof of a high @-@ rise block of flats on Rossville Street threw petrol bombs down on the police . 
Walkie @-@ talkies were used to maintain contact between different areas of fighting and DCDA headquarters in Paddy Doherty 's house in Westland Street , and first aid stations were operating , staffed by doctors , nurses and volunteers . Women and girls made milk @-@ bottle crates of petrol bombs for supply to the youths in the front line and " Radio Free Derry " broadcast to the fighters and their families . On the third day of fighting , 14 August , the Northern Ireland Government mobilised the Ulster Special Constabulary ( B @-@ Specials ) , a force greatly feared by nationalists in Derry and elsewhere . Before they engaged , however , British troops were deployed at the scene , carrying automatic rifles and sub @-@ machine guns . The RUC and B @-@ Specials withdrew , and the troops took up positions outside the barricaded area . + A deputation that included Eamonn McCann met senior army officers and told them that the army would not be allowed in until certain demands were met , including the disarming of the RUC , the disbandment of the B @-@ Specials and the abolition of Stormont ( the Parliament and Government of Northern Ireland ) . The officers agreed that neither troops nor police would enter the Bogside and Creggan districts . A ' peace corps ' was formed to maintain law and order . When the British Home Secretary , Jim Callaghan , visited Northern Ireland and announced his intention to visit the Bogside on 28 August , he was told that he would not be allowed to bring either police or soldiers with him . Callaghan agreed . Accompanied by members of the Defence Committee , he was " swept along by a surging crowd of thousands " up Rossville Street and into Lecky Road , where he " took refuge " in a local house , and later addressed crowds from an upstairs window . In preparation for Callaghan 's visit the " Free Derry " wall was painted white and the " You are now entering Free Derry " sign was professionally re @-@ painted in black lettering . 
+ Following Callaghan 's visit , some barricades were breached , but the majority remained while the people awaited concrete evidence of reform . Still the army made no move to enter the area . Law and order was maintained by a ' peace corps ' — volunteers organised by the DCDA to patrol the streets and man the barricades . There was very little crime . Punishment , in the words of Eamonn McCann , " as often as not consisted of a stern lecture from Seán Keenan on the need for solidarity within the area . " In September the barricades were replaced with a white line painted on the road . + The Hunt Report on the future of policing in Northern Ireland was presented to the Stormont cabinet in early October . Jim Callaghan held talks with the cabinet in Belfast on 10 October , following which the report 's recommendations were accepted and made public . They included the recommendation that the RUC should be ' ordinarily ' unarmed , and that the B @-@ Specials should be phased out and replaced by a new force . The new RUC Chief Constable , Arthur Young , an Englishman , was announced , and travelled to Belfast with Callaghan . The same day , Seán Keenan announced that the DCDA was to be dissolved . On 11 October Callaghan and Young visited Free Derry , and on 12 October the first military police entered the Bogside , on foot and unarmed . + + = = IRA resurgence = = + + The Irish Republican Army ( IRA ) had been inactive militarily since the end of the Border Campaign in 1962 . It was low in both personnel and equipment — Chief of Staff Cathal Goulding told Seán Keenan and Paddy Doherty in August 1969 that he " couldn 't defend the Bogside . I haven 't the men nor the guns to do it . " During the 1960s the leadership of the republican movement had moved to the left . Its focus was on class struggle and its aim was to unite the Irish nationalist and unionist working classes in order to overthrow capitalism , both British and Irish . 
Republican Clubs were formed in Northern Ireland , where Sinn Féin was proscribed . These clubs were involved in the formation of NICRA in 1967 . In Derry , the James Connolly Republican Club worked closely with Labour Party radicals , with whom they set up the Derry Housing Action Committee and Derry Unemployed Action Committee . The Derry Citizens ' Defence Association was formed initially by republicans , who then invited other nationalists to join . Although there were tensions between the younger leaders like Johnnie White and the older , traditional republicans such as Seán Keenan , both sides saw the unrest of 1968 @-@ 69 as a chance to advance republican aims , and the two shared the platform at the Easter Rising commemoration in April 1969 . + The events of August 1969 in Derry , and more particularly in Belfast where the IRA was unable to prevent loss of life or protect families burned out of their homes , brought to a head the divisions that had already appeared within the movement between the radicals and the traditionalists , and led to a split in December 1969 into the Official IRA and the Provisional IRA . Initially , both armies organised for defensive purposes only , although the Provisionals were planning towards an offensive campaign . In Derry there was far less hostility between the two organisations than elsewhere and householders commonly paid subscriptions to both . When rioters were arrested after the Official 's Easter parade in March 1970 , Officials and Provisionals picketed their trial together . At the start the Officials attracted most of the younger members . Martin McGuinness , who in August 1969 had helped defend the barricades , initially joined the Officials , but a few months later left to join the Provisionals . + Relations between the British Army and the residents had steadily decayed since the first appearance of troops in August 1969 . 
In September , after clashes between nationalist and unionist crowds that led to the death of a Protestant man , William King , the British Army erected a ' peace ring ' to enclose the nationalist population in the area they had previously controlled . Roads into the city centre were closed at night and people were prevented from walking on certain streets . Although some moderate nationalists accepted this as necessary , there was anger among young people . Clashes between youths and troops became more frequent . The riot following the Officials ' Easter parade in March 1970 marked the first time that the army used ' snatch squads ' , who rushed into the Bogside wielding batons to make arrests . The snatch squads soon became a common feature of army arrest operations . There was also a belief that they were arresting people at random , sometimes days after the alleged offence , and based on the identification of people that they had seen from a considerable distance . The rioters were condemned as hooligans by moderates , who saw the riots as hampering attempts to resolve the situation . The Labour radicals and Official republicans , still working together , tried to turn the youth away from rioting and create socialist organizations — one such organization was named the Young Hooligans Association — but to no avail . The Provisionals , while disapproving of riots , viewed them as the inevitable consequence of British occupation . This philosophy was more attractive to rioters , and some of them joined the Provisional IRA . The deaths of two leading Provisionals in a premature explosion in June 1970 resulted in young militants becoming more prominent in the organization . Nevertheless , up to July 1971 the Provisional IRA remained numerically small . + Two men , Séamas Cusack and Desmond Beattie , were shot dead in separate incidents in the early morning and afternoon of 8 July 1971 . They were the first people to be killed by the British Army in Derry . 
In both cases the British Army claimed that the men were attacking them with guns or bombs , while eyewitnesses insisted that both were unarmed . The Social Democratic and Labour Party ( SDLP ) , the newly formed party of which John Hume and Ivan Cooper were leading members , withdrew from Stormont in protest , but among residents there was a perception that moderate policies had failed . The result was a surge of support for the IRA . The Provisionals held a meeting the following Sunday at which they called on people to " join the IRA " . Following the meeting , people queued up to join , and there was large @-@ scale rioting . The British Army post at Bligh 's Lane came under sustained attack , and troops there and around the city came under fire from the IRA . + + = = Internment and the third Free Derry = = + + The increasing violence in Derry and elsewhere led to increasing speculation that internment without trial would be introduced in Northern Ireland , and on 9 August 1971 hundreds of republicans and nationalists were arrested in dawn raids . In Derry , residents came out onto the streets to resist the arrests , and fewer people were taken there than elsewhere ; nevertheless leading figures including Seán Keenan and Johnnie White were interned . In response , barricades were erected once again and the third Free Derry came into existence . Unlike its predecessors , this Free Derry was marked by a strong IRA presence , both Official and Provisional . It was defended by armed paramilitaries — a no @-@ go area , one in which British security forces were unable to operate . + Gun attacks on the British Army increased . Six soldiers were wounded in the first day after internment , and shortly afterwards a soldier was killed — the first to be killed by either IRA in Derry . The army moved in in force on 18 August to dismantle the barricades . A gun battle ensued in which a young Provisional IRA officer , Eamonn Lafferty , was killed . 
A crowd staging a sit @-@ down protest was hosed down and the protesters , including John Hume and Ivan Cooper , arrested . With barricades re @-@ appearing as quickly as they were removed , the army eventually abandoned their attempt . + The Derry Provisionals had little contact with the IRA elsewhere . They had few weapons ( about twenty ) which they used mainly for sniping . At the same time , they launched their bombing campaign in Derry . Unlike in Belfast , they were careful to avoid killing or injuring innocent people . Eamonn McCann wrote that " the Derry Provos , under Martin McGuinness , had managed to bomb the city centre until it looked as if it had been hit from the air without causing any civilian casualties . " + Although both IRAs operated openly , neither was in control of Free Derry . The barricades were manned by unarmed ' auxiliaries ' . Crime was dealt with by a volunteer force called the Free Derry Police , which was headed by Tony O 'Doherty , a Derry footballer and Northern Ireland International . + + = = Bloody Sunday = = + + An anti @-@ internment protest organised by the Northern Ireland Civil Rights Association ( NICRA ) at Magilligan Camp in January 1972 was met with violence from the 1st Battalion , The Parachute Regiment ( 1 Para ) . NICRA had organised a march from the Creggan to Derry city centre , in defiance of a ban , on the following Sunday , 30 January 1972 . Both IRAs were asked , and agreed , to suspend operations on that day to ensure the march passed off peacefully . The British Army erected barricades around the Free Derry area to prevent marchers from reaching the city centre . On the day , march organisers turned the march away from the barriers and up to Free Derry Corner , but some youths proceeded to the barrier at William Street and stoned soldiers . Troops from 1 Para then moved into Free Derry and opened fire , killing thirteen people , all of whom were subsequently found to be unarmed . 
A fourteenth shooting victim died four months later in June 1972 . Like the killing of Cusack and Beattie the previous year , Bloody Sunday had the effect of hugely increasing recruitment to the IRA , even among people who previously would have been ' moderates ' . + + = = February - July 1972 = = + + Both the Provisional and Official IRA stepped up attacks after Bloody Sunday , with the tacit support of the residents . Local feelings changed , however , with the killing of Ranger William Best by the Official IRA . Best was a 19 @-@ year @-@ old local man who was home on leave from the British Army at his parents ' house in the Creggan . He was abducted , interrogated and shot . The following day 500 women marched to the Republican Club offices in protest . Nine days later , on 29 May , the Official IRA declared a ceasefire . The Provisional IRA initially stated that they would not follow suit , but after informal approaches to the British Government they announced a ceasefire from 26 June . Martin McGuinness was the Derry representative in a party of senior Provisionals who travelled to London for talks with William Whitelaw , the Secretary of State for Northern Ireland . The talks were not resumed after the ending of the truce following a violent confrontation in Belfast when troops prevented Catholic families from taking over houses in the Lenadoon estate . + Political pressure for the action against the " no @-@ go " areas increased after the events of Bloody Friday in Belfast . A British Army attack was considered inevitable , and the IRA took the decision not to resist it . On 31 July 1972 , Operation Motorman was launched when thousands of British troops , equipped with armoured cars and armoured bulldozers ( AVREs ) , dismantled the barricades and occupied the area . 
+ + = = Subsequent history = = + + After Operation Motorman , the British Army controlled the Bogside and Creggan by stationing large numbers of troops within the area , by conducting large @-@ scale ' search ' operations that were in fact undertaken for purposes of intelligence gathering , and by setting up over a dozen covert observation posts . Over the following years IRA violence in the city was contained to the point where it was possible to believe ' the war was over ' in the area , although there were still frequent street riots . Nationalists — even those who did not support the IRA — remained bitterly opposed to the army and to the state . + Many of the residents ' original grievances were addressed with the passing of the Local Government ( Northern Ireland ) Act , 1972 , which redrew the electoral boundaries and introduced universal adult suffrage based on the single transferable vote . Elections were held in May 1973 . Nationalists gained a majority on the council for the first time since 1923 . Since then the area has been extensively redeveloped , with modern housing replacing the old houses and flats . The Free Derry era is commemorated by the Free Derry wall , the murals of the Bogside Artists and the Museum of Free Derry . + + + = Come What ( ever ) May = + + Come What ( ever ) May is the second studio album by American alternative metal band Stone Sour . It was recorded and produced by the band and Nick Raskulinecz at Studio 606 in Los Angeles , California , and was released on August 1 , 2006 , through Roadrunner Records . Writing for the album began as early as 2003 when vocalist Corey Taylor and guitarist James Root were writing material for their other band , Slipknot . In January 2006 Stone Sour began recording the follow @-@ up to their 2002 debut album Stone Sour , during which time drummer Joel Ekman left the band due to family constraints . 
He was eventually replaced by ex @-@ Soulfly drummer Roy Mayorga who played on all but two tracks on the album . + Following the release of the album , Stone Sour went on to promote it for over a year , releasing five singles and touring in several regions , including the United States , Canada , Japan and several countries in Europe . The album received generally positive reviews . It was praised for showing a progression in the band 's songwriting ability and musical style . It was also certified Gold in the United States and Canada and the single " 30 / 30 @-@ 150 " was nominated for Best Metal Performance at the 49th Grammy Awards . On June 26 , 2007 Stone Sour released a special edition version of the album ; it included six previously unreleased tracks and a bonus DVD which featured three music videos and a complete live performance of the band in Moscow . It remains their best @-@ selling album to date , mostly due to the success of the single " Through Glass . " + + = = Production = = + + In September 2005 , lead singer Corey Taylor announced that Stone Sour would return with a second album . He said that they had written over 30 songs , some during the writing process of Vol . 3 : ( The Subliminal Verses ) , the third album by vocalist Taylor and guitarist James Root 's other band Slipknot , and that they were working on demoing the tracks before entering the studio . Dave Fortman was originally slated to produce the album ; however , on January 22 , 2006 Stone Sour began working on the album with producer Nick Raskulinecz at Dave Grohl 's personal studio ( Studio 606 ) , in Los Angeles . Time in the studio began with a week of pre @-@ production , during which guitarist Josh Rand says producer Raskulinecz " pushed [ the band ] to the brink and back " to help fine @-@ tune the songs they had previously written . 
Though Rand and Taylor wrote most of the music and lyrics for the first album , respectively , writing for Come What ( ever ) May was done by all members . + Following this , the band set out to record 18 tracks and work began on recording Joel Ekman 's drum tracks . However , Ekman was forced to leave the studio after four weeks due to his young son 's diagnosis of a brainstem glioma . With the fate of the album in jeopardy , Stone Sour recruited ex @-@ Soulfly member Roy Mayorga as a session drummer . Mayorga recorded drums for all but two tracks on the album , Godsmack drummer Shannon Larkin performed on the track " 30 / 30 @-@ 150 " and guitarist Root performed drums on the bonus track " The Day I Let Go . " In an interview with Revolver during the recording process vocalist Taylor talked about the differences between this album and their previous album , Stone Sour . He said that pressures from fans and the record label were much larger ; also noting that he " thrives on the pressure , because it gets [ him ] going . " While promising that " the album 's gonna be miles above the first one , " Taylor explained that it is " more melodic and darker " . In late March 2006 , drummer Joel Ekman officially left Stone Sour and the band was talking with a few drummers who could replace him . On April 7 , 2006 the recording sessions for Come What ( ever ) May concluded . A month later session drummer Roy Mayorga joined Stone Sour on a full @-@ time basis . + + = = Promotion = = + + It was announced in March 2006 that Stone Sour 's second album , which was tentatively titled " Come What May , " would be released on July 18 , 2006 . However , the release date for the album was pushed back until August 22 . Due to the delay Stone Sour released a music video for the track " Reborn " , which featured footage of the band working on the album in the studio . The cover artwork from the album was released online on May 20 , 2006 . 
Shortly after , it was confirmed by a representative from the band 's record label Roadrunner that the release date had been brought forward , and the official release date would be August 1 , 2006 . On July 31 , 2006 , the day before its release , the album was made available online for streaming in its entirety through AOL . + On May 22 , 2006 the first single from the album , " 30 / 30 @-@ 150 " , was made available online as a free MP3 download . A music video for the single was shot with director P.R. Brown in Los Angeles ; the video received its premiere on MTV 's Headbangers Ball on June 3 , 2006 . Prior to the release of the second single from the album , " Through Glass " , radio stations throughout the US showed high support for the song . A music video for the single was shot with director Tony Petrossian and was released on June 9 , 2006 online through Yahoo ! . The third single from the album , " Sillyworld " , began receiving radio airplay in November 2006 . A music video for the single was shot in January 2007 and was released online on March 8 , 2007 . The fourth single from the album , " Made of Scars " , featured a music video which was recorded live on April 7 , 2007 and was posted online on June 5 , 2007 . The fifth and final single from the album , " Zzyzx Rd . " , started receiving radio airplay in Fall 2007 and no music video was made for the single . + The band began touring in support of the album prior to its release , starting with several free shows in the US , followed by multiple appearances at festivals in Europe . They then joined Korn for the 2006 edition of the Family Values Tour across the US , which featured 33 dates across 3 months . On August 8 , 2006 Stone Sour made a special guest appearance on The Tonight Show with Jay Leno to promote and perform their second single " Through Glass . " They also performed at the Japanese festival Summer Sonic midway through the Family Values Tour . 
Then through November and December 2006 , Stone Sour joined Disturbed for their Music as a Weapon Tour . In January 2007 Stone Sour joined Evanescence for a Canadian tour , followed by a headlining tour of Europe . They then headlined the Spring 2007 Jägermeister Music Tour across the US , followed by headlining tours in Australia and Japan . They then started a tour in Europe playing festivals and select headline shows . They wrapped up touring in support of the album with a headlining tour in the US through August and September in 2007 . + + = = = Special edition = = = + + On June 26 , 2007 , Stone Sour released a special edition version of the album with six previously unreleased tracks and a bonus DVD . The DVD featured a full concert performance by the band from October 2006 in Moscow and the music videos for " 30 / 30 @-@ 150 , " " Through Glass , " and " Sillyworld . " When talking about the special edition , vocalist Taylor said , " we really wanted to do something which was really cool , " saying that this shows the band 's different musical elements and them in their live element , which he says " people really gravitate towards . " In addition to this , Stone Sour released a live album of their concert in Moscow exclusively on iTunes , entitled Live in Moscow . + + = = Musical style = = + + In an interview with MTV in 2006 , vocalist Corey Taylor said that Come What ( ever ) May was a return to the roots of the band , stating it is " a lot more from the spirit of what the band started with in 1992 . " Noting how some songs were " very atmospheric , " while others maintained " the hard rock and the heavy stuff . " Jon Wiederhorn of MTV said that " for every thrash riff there 's a tunefully grungy passage , for every flailing guitar line there 's a rock @-@ radio hook . 
" When talking about the track " 30 / 30 @-@ 150 " , he said parts are " bludgeoning , barbed and heavy , " while others are " soaring and triumphant , " with the production of Raskulinecz helping balance the album 's heaviness with its radio @-@ accessibility . Come What ( ever ) May 's lyrics include themes of " pain , pleasure , happiness , and grief . " The diversity in subjects is evident throughout the album , songs including " Come What ( ever ) May " were politically influenced while the track " Socio " is about " social anxiety attacks " that vocalist Taylor suffered . " Zzyzx Rd " is a love song written to Taylor 's wife for helping him in his struggles against alcoholism and contemplation of suicide . " I 've never written anything like that before , but it was very important for me to tell the world not only how much she saved me , but how much she means to me , " said Taylor . Taylor said there is a common thread with the lyrics throughout the album , saying that they are " about never forgetting where you came from , who you are and why you do this . " + + = = Reception = = + + Come What ( ever ) May was met with generally positive critical reviews . Several reviewers noted on how it helped to further establish Stone Sour . Chad Bower of About.com stated that the band had " progressed a lot since their debut " , noting that the album was " very diverse and [ allows ] the band to show many different sides of their musical personality . " Megan Frye of Allmusic opens her review of the album by distinguishing what sets Stone Sour apart musically , stating " [ it 's their ] ability to create smooth , radio @-@ friendly alternative metal songs while simultaneously not boring the people who have heard way too much from post @-@ grunge groups . " On a similar note , Michael Melchor of 411mania said " the band is much better at the craft of songwriting than many of their peers . 
" However , in contrast , reviewer William Fry of IGN criticized the album , saying " Stone Sour doesn 't do anything inspired , original , or fresh here " even calling the album " completely misdirected , and stonewalled . " A particular point of interest for reviewers was how Come What ( ever ) May is more melodic than their previous album Stone Sour . Melchor of 411mania said the album is " much more liberal with the balladry and acoustic sounds than its predecessor , " noting on the track " Sillyworld " he said " it sounds like what Nickelback could be if Chad Kroeger could write a good melody " . In his review , Chad Bower labeled Come What ( ever ) May as a " very melodic and accessible album " stating that " it has a little something for everyone . " Similarly , Megan Frye triumphed the album as an " unyielding effort from a promising talent " . + Come What ( ever ) May sold over 80 @,@ 000 copies in its first week and debuted at the fourth spot on the Billboard 200 in the United States , and went on to be certified gold in the UK , Canada and the United States . In 2007 , the single " 30 / 30 @-@ 150 " was nominated for Best Metal Performance at the 49th Grammy Awards . + + = = Track listing = = + + All lyrics written by Corey Taylor , all music composed by Stone Sour . + On the iTunes deluxe version , the pop version of " Zzyzx Rd . " replaced the original version as the 12th track . + + = = = Special edition DVD = = = + + + = = Personnel = = + + + = = Chart positions = = + + + + = Chad at the 2008 Summer Olympics = + + Chad sent a delegation of two athletes to compete at the 2008 Summer Olympics in Beijing , China : Moumi Sébergué , who competed in the men 's 100 meters , and Hinikissia Albertine Ndikert , who competed in the women 's 100 meters and also bore the Chadian flag during ceremonies . 
The appearance of this delegation marked the tenth appearance of Chad at the Summer Olympics , the first being at the 1964 Summer Olympics in Tokyo , Japan , and its seventh appearance since its Olympic hiatus between 1976 and 1980 . Both Sébergué and Ndikert ranked seventh in their respective heats and did not advance past the qualification round . As of the end of the 2012 London Olympics , there have been no medalists from Chad . + + = = Background = = + + Chad is a landlocked country in Africa whose northern region lies within the eastern reaches of the Sahara Desert and whose southern region lies within the eastern portion of the Sahel . It borders Libya to the north , Niger to the west , Sudan to the east , and the Central African Republic to the south . Chad was originally part of French West Africa until 1960 , when it declared independence . Some four years later , the former French colony made its début at the 1964 Summer Olympics in Tokyo , Japan . For the next three decades , Chad became embroiled in civil war and experienced invasions by Libya and upheavals by Sudanese @-@ backed rebels ; the civil war ended in 1990 , although rebel threats had persisted between then and 2008 . During Chad 's greatest era of instability , athletes from the country did not attend the 1976 Summer Olympics in Montréal , Canada or the 1980 Summer Olympics in Moscow , USSR , although delegations were sent to all other games between 1964 and 2008 . + The largest Chadian delegation to reach the Olympics appeared in the 1988 Summer Olympics in Seoul , South Korea and at the 1992 Summer Olympics in Barcelona , Spain ; each time , Chad 's National Olympic Committee sent six athletes . During the 1992 games , the NOC sent the nation 's first female Olympian . Since then ( and up to the Beijing games ) , at least one woman has been a part of the Chadian delegation . 
The smallest contingent of Chadian Olympians appeared at the 2004 Summer Olympics in Athens , Greece , when only Kaltouma Nadjina competed on the country 's behalf . The delegation that arrived in Beijing consisted of two athletes — one man ( 30 @-@ year @-@ old Moumi Sébergué ) and one woman ( 15 @-@ year @-@ old Hinikissia Albertine Ndikert ) , both participants in track events . Ndikert was Chad 's flagbearer at the ceremonies . Up to and including the Beijing games , there had yet to be a medalist from Chad . + + = = Athletics = = + + Competitors in athletics events could qualify for the next round of competition in two ways . Qualifying by right was posting a high result in their own heat , and qualifying by result was posting a high result in overall standings . Ranks shown are thus those within each heat , not in overall standings . + Moumi Sébergué represented Chad at the Beijing Olympics in the men 's 100 meters dash . Born in 1977 , Sébergué first participated in the Olympics at age 22 when he raced in the men 's 100 meters at the 2000 Summer Olympics in Sydney , Australia , placing seventh in his qualification heat and not progressing to later rounds . He did not attend the 2004 Summer Olympics in Athens , Greece , but returned to the Olympics at Beijing at the age of 30 . During the course of the August 14 , 2008 races in his event , when the qualification round took place , Sébergué competed in the tenth heat against seven other athletes . He finished the race in 11 @.@ 14 seconds , placing seventh in the heat ahead of Tuvalu 's Okinali Tinilau ( 11 @.@ 48 seconds ) and behind Gabon 's Wilfried Bingangoye ( 10 @.@ 87 seconds ) in a heat led by the Netherlands Antilles ' Churandy Martina ( 10 @.@ 35 seconds ) and Japan 's Naoki Tsukahara ( 10 @.@ 39 seconds ) . Of the 80 athletes who participated in the event , the Chadian sprinter ranked 70th . He did not advance to later rounds . 
+ Hinikissia Albertine Ndikert competed on Chad 's behalf as the national delegation 's only female athlete at the Beijing games . She participated in the women 's 100 meters dash , and was 15 years old at the time of the competition . Ndikert had not previously competed in any Olympic games . During the qualification round of the event , which took place on August 15 , 2008 , Ndikert competed in the eighth heat against seven other athletes . She finished the race in 12 @.@ 55 seconds , placing seventh ; she defeated the Democratic Republic of the Congo 's Franka Magali ( 12 @.@ 57 seconds ) and fell behind Papua New Guinea 's Mae Koime ( 11 @.@ 68 seconds ) in a heat led by Nigeria 's Damola Osayomi ( 11 @.@ 13 seconds ) and the Bahamas ' Debbie Ferguson @-@ McKenzie ( 11 @.@ 17 seconds ) . Of the event 's 85 competitors , Ndikert finished in 64th place . Therefore , Ndikert did not advance to round two and beyond . + Key + Note – Ranks given for track events are within the athlete 's heat only + Q + = Qualified for the next round + q = + Qualified for the next round as a fastest loser or , in field events , by position without achieving the qualifying target + NR + = National record + N / A = + Round not applicable for the event + Bye = Athlete not required to compete in round + + + = View of the World from 9th Avenue = + + View of the World from 9th Avenue ( sometimes A Parochial New Yorker 's View of the World , A New Yorker 's View of the World or simply View of the World ) is a 1976 illustration by Saul Steinberg that served as the cover of the March 29 , 1976 , edition of The New Yorker . The work presents the view from Manhattan of the rest of the world showing Manhattan as the center of the world . + View of the World has been parodied by Ted Rall , Columbia Pictures , The New Yorker , The Economist and Mad Magazine , among others . The work has been imitated and printed without authorization in a variety of ways . 
The Columbia parody led to a ruling by the United States District Court for the Southern District of New York in Steinberg v. Columbia Pictures Industries , Inc. in favor of Steinberg because of copyright violations by Columbia Pictures . The work is regarded as one of the greatest magazine covers of recent generations and is studied by art students around the world . + + = = Background = = + + Saul Steinberg created 85 covers and 642 internal drawings and illustrations for The New Yorker , including its March 29 , 1976 cover , titled " View of the World from 9th Avenue " . This is regarded as his most famous work . It is considered an example of unintentional fame : Steinberg has noted that the type of fame that resulted from the work has diminished his significance to " the man who did that poster " . The work is sometimes referred to as A Parochial New Yorker 's View of the World or A New Yorker 's View of the World because it depicts a map of the world as seen by self @-@ absorbed New Yorkers . At one point The New Yorker applied for a copyright from the United States Copyright Office for the work . It assigned the copyright to Steinberg and subsequently reproduced posters of the painting . + + = = Detail = = + + The illustration is split in two parts , with the bottom half of the image showing Manhattan 's 9th Avenue , 10th Avenue , and the Hudson River ( appropriately labeled ) , and the top half depicting the rest of the world . It is a westward view over 10th Avenue . The rest of the United States is the size of the three New York City blocks and is drawn as a rectangle bounded by North American neighbors Canada and Mexico , with a thin brown strip along the Hudson representing " Jersey " , the names of five cities ( Los Angeles ; Washington , D.C. 
; Las Vegas ; Kansas City ; and Chicago ) and three states ( Texas , Utah , and Nebraska ) scattered among a few rocks for the United States beyond New Jersey , which is in bolder font than the rest of the country beyond the Hudson . Washington , D.C. is depicted as a remote location near Mexico . The Pacific Ocean , slightly wider than the Hudson , separates the United States from three flattened land masses labeled China , Japan and Russia . Notably , the image depicts the world with a back turned to Europe , which is absent from the painting . + The work is composed in ink , pencil , colored pencil , and watercolor on paper and measures 28 by 19 inches ( 71 cm × 48 cm ) . When exhibiting this work along with alternate versions and sketches , the University of Pennsylvania summarized the work as a " bird 's @-@ eye view of the city from Ninth Avenue in a straight line westward , with space becoming ever more condensed ... " They also described the work as a tongue @-@ in @-@ cheek view of the world . New York interpreted the New York @-@ centric mind 's view of the rest of the world as a set of outer boroughs as iconic . National Post journalist Robert Fulford described the perspective as one in which the entire world is a suburb of Manhattan . + + = = Parodies = = + + View of the World has been imitated without authorization in a variety of ways . The work has been imitated in postcard format by numerous municipalities , states and nations . Steinberg had stated that he could have retired on royalties from the many parodies made of the painting , had they been paid , a motivation for his eventual copyright lawsuit for the Moscow on the Hudson use . Fulford , writing in The National Post , noted that the metaphor of the world as a suburb of Manhattan was " understood and borrowed " by the whole world . Local artists , especially poster artists , presented similarly compelling depictions of their own provincial perceptions . 
Fulford demonstrated the prominence of this work by mentioning that a high school in suburban Ottawa made imitating View of the World an assignment in its graphic arts class . He also noted that the result of this assignment was a worldwide variety of global foci from which the students viewed the world . + The illustration — humorously depicting New Yorkers ' self @-@ image of their place in the world , or perhaps outsiders ' view of New Yorkers ' self @-@ image — inspired many similar works , including the poster for the 1984 film Moscow on the Hudson ; that movie poster led to a lawsuit , Steinberg v. Columbia Pictures Industries , Inc . , 663 F. Supp . 706 ( S.D.N.Y. 1987 ) , which held that Columbia Pictures violated the copyright Steinberg held on his work . + On June 5 , 2003 , during the first term of George W. Bush 's presidency , Ted Rall presented A View of the World from Pennsylvania Avenue as a parody of View of the World from 9th Avenue in a Universal Press Syndicate editorial cartoon . He replaced the letters representing The New Yorker with The Bushie . + The cover was later satirized by Barry Blitt for the cover of The New Yorker on October 6 , 2008 . The cover featured Sarah Palin looking out of her window seeing only Alaska , with Russia in the far background . + The March 21 , 2009 issue of The Economist included a story entitled " How China sees the World " that presented a parody that is also an homage to the original image , but depicting the viewpoint from Beijing 's Chang 'an Avenue instead of Manhattan . A caption above the illustration reads " Illustration by Jon Berkeley ( with apologies to steinberg and The New Yorker ) " . It accompanied an article that discussed the burgeoning Chinese economy at the time of the contemporary financial crisis . + The October 1 , 2012 cover of Mad Magazine satirized the problems with the September release of Apple Inc. 's iOS 6 mobile operating system which included Apple Maps , a replacement for Google Maps . 
The work presents what View of the World might look like if one had relied upon the September 2012 version of Apple Maps to locate various landmarks . + + = = Critical review = = + + On October 17 , 2005 , American Society of Magazine Editors unveiled its list of the top 40 magazine covers of the prior 40 years and ranked View of the World from 9th Avenue in fourth place . The listing stated that the work " ... has come to represent Manhattan 's telescoped perception of the country beyond the Hudson River . The cartoon showed the supposed limited mental geography of Manhattanites . " + + + = Bintulu = + + Bintulu / biːnˈtuːluː / ( Chinese : 民都魯 ; pinyin : Míndūlǔ ; Pe ̍ h @-@ ōe @-@ jī : Bîn @-@ to ͘ -ló ͘ ) is a coastal town on the island of Borneo in the central region of Sarawak , Malaysia . Bintulu is located 610 kilometres ( 380 mi ) northeast of Kuching , 216 kilometres ( 134 mi ) northeast of Sibu , and 200 kilometres ( 120 mi ) southwest of Miri . With a population of 114 @,@ 058 as of 2010 , Bintulu is the capital of the Bintulu District of the Bintulu Division of Sarawak , Malaysia . + The name of Bintulu was derived from the local native language " Mentu Ulau " ( picking heads ) . Bintulu was a small fishing village when Rajah James Brooke acquired it in 1861 . Brooke later built a fort there in 1862 . In 1867 , the first General Council meeting ( now Sarawak State Legislative Assembly ) was convened in Bintulu . It is the earliest state legislature system in Malaysia . The construction of the earliest airstrip in Bintulu began in 1934 but was halted in 1938 due to financial difficulties . During World War II , the airstrip was heavily bombed by Allied forces . The British later rebuilt the airstrip , and it became fully operational in 1955 . The old airport was replaced by a new airport in 2002 . Bintulu remained a fishing village until 1969 when oil and gas reserves were discovered off the coast . 
Since then , Bintulu has become the centre of energy intensive industries such as a Malaysia LNG plant , a Shell Middle Distillate Synthesis plant , and a Bintulu combined cycle power plant . The economy has also expanded into oil palm and forest plantations , palm oil processing , wood @-@ waste processing , and cement manufacturing . The port of Bintulu is the busiest in Sarawak . The town is also a gateway to Samalajau Industrial Park . + Among the tourist attractions in Bintulu are Similajau National Park , Tumbina Park , Tanjung Batu beach , Jepak village , Kuan Yin Tong temple , Assyakirin mosque , Council Negri monument , Tamu Bintulu , and Pasar Utama markets . The Borneo International Kite Festival is held annually in the town . + + = = Etymology = = + + During the 16th century , Bintulu was named " River de Burulu " by Portuguese cartographers . There are several legends surrounding the name Bintulu . During the Brooke dynasty , the indigenous Iban people practised headhunting to maintain their social status in the community . They threw the heads into the Kemena River , after which the heads had to be collected from the river . The practice of collecting the heads was known as " Mentu Ulau " ( picking heads ) in the local native language . Another story relates that two Iban warriors named Bernik and Jelab built houses along the river . They and their followers frequently carried out preservation of severed heads near a small river stream branching off from Sebezaw River because the river bank was flat and wide . Therefore , the small river stream was named " Mentu Ulau " river . Outsiders who came to Bintulu subsequently pronounced the name as " Mentulau " , and later the name evolved into " Bentulu " and , finally , " Bintulu " . + + = = History = = + + + = = = Brooke dynasty = = = + + James Brooke was appointed the White Rajah of Sarawak ( now known as Kuching ) by the Bruneian Empire in 1841 . 
In 1861 , the Sultanate of Brunei ceded the Bintulu region to Brooke . Bintulu was a small settlement at that time . A wooden fort named Fort Keppel was built in the village , named after Sir Henry Keppel , who was a close friend of the Rajah James and Charles Brooke . Sir Henry Keppel was responsible for crushing the Dayak piracy in the Saribas between 1840 and 1850 . Meanwhile , Charles Brooke was a nephew of James Brooke and would later become the latter 's successor as the second Rajah of Sarawak . Odoardo Beccari , an Italian botanist , visited Bintulu in 1867 . On 4 August , he started his journey on a gunboat named " Heartsease " , which was to send $ 6 @,@ 000 to Brunei for concessions being made to James Brooke in the Mukah and Bintulu regions . He went to Labuan before coming back to Bintulu . He dropped off at Kemena River on 13 August 1867 . His observations of the village were recorded as follows : + The fort of Bintulu which was built entirely of wood , was in somewhat ruinous condition . It stood nearly on the sea @-@ shore , and just behind it , at a distance of few paces , the primeval forests commenced ... Some chinamen had settled at the vicinity of the fort and had built a small bazaar ; but the village is chiefly formed by the houses of the Melanau beyond the Chinese kampong ( village ) . These Melanaus used to live further up the river , but since the construction of the fort , and the installation of an officer of the Rajah near the mouth of the river , they came to settle near the sea – a thing they would never have dared to do in former days for fear of the attacks of the Lanun pirates and Dayak pirates . + The houses of the Melanau people were built in rows on both sides of the Kemena River , mostly furnished by Nipah and Sago palms . Each house had its own shed projection into the entrance of the river , which was used for the processing of Sago palms . 
On 8 September 1867 , the first Sarawak General Council meeting ( now Sarawak State Legislative Assembly ) took place here . It was made up of 21 elected local community members ( five British officers and 16 Malay and Melanau local chiefs ) . The Council was formed by Raja Muda Charles Brooke under orders from Rajah James Brooke . The Council is the oldest state legislative assembly in Malaysia . + + = = = Japanese occupation = = = + + During World War II , Rajah Charles Vyner Brooke ordered the construction of airstrips in Kuching , Oya , Mukah , Bintulu , and Miri . Construction of the Bintulu airstrip was started in 1934 under the direction of C. W. Bailey , a Works and Building Inspector for the British Royal Air Force ( RAF ) . All the airstrips were completed except for the Bintulu airstrip , where construction was discontinued in October 1938 due to financial reasons . Japanese forces landed in Miri on 16 December 1941 . Sarawak fell into Japanese hands when they conquered Kuching on 24 December 1941 . When the Japanese invaded Sarawak , Charles Vyner Brooke had already left for Sydney ( Australia ) before the attack , while his officers were captured by the Japanese and interned at the Batu Lintang camp . During the Japanese occupation , the Japanese used the airstrip for military purposes . However , the airstrip was heavily bombed by Allied forces . The British began reconstruction of the airstrip after the war ; during the project many unexploded bombs were unearthed . + On 5 September 1942 , Japanese Field Marshal Prince Maida ( 前田利为 ) boarded a plane from Kuching to Labuan to officiate an airport that bears his name . However , he never arrived . One month later , the plane was found to have crashed off the coast of Tanjung Datu , Bintulu . The cause of the plane crash was not known . The Japanese later set up a wooden pole memorial made up of Belian wood in Bintulu . The wooden pole was later taken back to Japan by the family of Prince Maida .
+ Chinese sawmill owners at Sibu and Bintulu were instructed by the Japanese to produce timber for repairs at oil fields and ship building . During the Japanese occupation , sawmills at Bintulu produced a total of 4 @,@ 000 tons of sawn timber . + + = = = Post @-@ war period = = = + + In the 1950s , major economic activities in Bintulu were the timber extraction industry , fishing , and Sago processing . In the 1960s , Bintulu was still a small fishing village , with a population of 5 @,@ 000 . No roads were constructed in Bintulu until 1969 when the first untarred road was built to connect Bintulu to Miri . The first bus that serviced the Miri – Bintulu route was owned by Majlis Amanah Rakyat ( MARA ) . The MARA bus line was an initiative by the Malaysian federal government to provide public transportation for the people . The Iban villagers paid the bus driver with " vegetables , chickens , bamboo shoots , and other items " . Before 1960 , Bintulu was connected to Kuching by sea through a ship named " Swee Joo " . After 1960 , the ship " Chin Chin " was added to the route . It took around 36 to 48 hours to reach Bintulu from Kuching , depending on the sea conditions . Due to lack of food supplies from Kuching , the villagers had to make do with limited food , and several villagers resorted to hunting in the jungles to supplement the food supply . + In 1960 there were only three primary schools in Bintulu . These schools provided classes until Primary 3 level . There were no secondary schools . Villagers could pursue their secondary school studies at either Miri or Kuching by using small boats as there were no roads connecting Bintulu to either Miri or Kuching . Bintulu Government Secondary School was opened in 1964 . In 1967 Bintulu celebrated the first 100 years of the Council Negri meeting ( Sarawak State Legislative Assembly ) . A stone monument was built in front of a government rice storeroom to commemorate the event . 
Bintulu was a sub @-@ district of Miri Division in the 1970s . The sub @-@ district was upgraded into a district in 1987 . + + = = = Discovery of oil and gas reserves = = = + + Large reserves of natural gas were discovered off the coast of Bintulu in 1969 . Following this , a feasibility study was done in 1975 , and Tanjung Kidurong was found to be a suitable site for a deep @-@ water port . On 14 June 1978 , Malaysia LNG Sdn Bhd ( MLNG Satu ) was established by Petronas , a Malaysian national oil and gas company for Liquefied Natural Gas ( LNG ) processing at Bintulu . On 8 July 1978 , the Bintulu Development Authority ( BDA ) was established by the Sarawak state government for infrastructure development and to promote industrial investment in the area . On 15 August 1981 , the Bintulu Port Authority was established at Tanjung Kidurong , starting operation on 1 January 1983 . Since the establishment of Sarawak Corridor of Renewable Energy ( SCORE ) in 2008 , Bintulu has become the gateway to Samalaju Industrial Park , which is located 62 kilometres ( 39 mi ) away from Bintulu . The industrial park is a centre of heavy , energy @-@ intensive industry . Among the companies that started their operations in the industrial park are Tokuyama Malaysia Sdn Bhd , Press Metal Bintulu Sdn Bhd , and OM Materials Sdn Bhd . + Rural – urban migration is significant in Bintulu because of greater job availability in the town . Since 2007 , new residents have started several squatter areas in Bintulu due to inability to find affordable housing , around Kidurong Industrial estate and Sungai Sebatang . To address the issue , several low @-@ cost housing projects were initiated by BDA and Sarawak state government to relocate the squatters . The state government planned to achieve zero squatters status by the year 2020 .
Bintulu also saw the rise in the number of residential and commercial properties such as double @-@ storeyed terraced houses , terraced shopoffices , Kidurong Commercial Centre , and Time Square Shopping Mall . Residential properties have shown a 20 % price increase from 2011 to 2013 . + + = = Governance = = + + Bintulu is represented by Bintulu parliamentary seat ( P. 217 ) in the Parliament of Malaysia . The town is also represented by three state assembly seats – Jepak , Kidurong ( later split into two state assembly seats , namely Tanjung Batu and Samalaju ) , and Kemena – in the Sarawak State Legislative Assembly . + + = = = Local authorities = = = + + Since 1978 the town of Bintulu has been administered by the Bintulu Development Authority ( BDA ) , with offices located along Jalan Tanjung Kidurong . The town is located within the boundary of Bintulu District , with a population of 183 @,@ 402 and a total area of 7 @,@ 220 @.@ 40 square kilometres ( 2 @,@ 787 @.@ 81 sq mi ) . Bintulu Division was formerly a Bintulu District under the jurisdiction of Miri Division . The former Bintulu District was upgraded to Bintulu Division on 1 January 1987 . At the same time , Bintulu sub @-@ district was upgraded to the present @-@ day Bintulu District . Both the Bintulu Resident and District offices are located inside Wisma Residen , Pisang Keling Street , Bintulu . + + = = Geography = = + + Bintulu is located 610 kilometres ( 380 mi ) northeast of Kuching , 216 kilometres ( 134 mi ) northeast of Sibu , and 200 kilometres ( 120 mi ) southwest of Miri . Bintulu is located near the mouth of the Kemena River , in the coastal region of central Sarawak . Geology of the coastal area was formed during the Pleistocene period ; silt , clay , and gravel can be found here . Geological formation from the Oligocene period is found in the inland area , which contains limestone , siltstone , and sandstone . The soil is generally soft .
+ + = = = Climate = = = + + There are two monsoon seasons in Bintulu : the northeast monsoon ( November to March ) and the southwest monsoon ( May to September ) . The calm period between these two monsoons is known as the transitional period . In the coastal region , maximum rainfall occurs in the month of January , while minimal rainfall occurs in the period from June to August . Rainfall is more evenly distributed in the inland areas . The annual rainfall of the Bintulu region is about 3 @,@ 750 mm ( 148 in ) . The mean daily hours of sunshine at Bintulu is about 5 @.@ 0 to 5 @.@ 5 hours . Bintulu receives on average 14 to 15 mJ / m2 of radiation throughout the year . Bintulu 's relative humidity is 85 % . + + = = Demographics = = + + The growth of Bintulu 's population is shown below : + The issue of gangsters in Bintulu was first raised in 2007 by the member of parliament ( MP ) for Bintulu . The gangsters may have run businesses related to illegal logging , controlling the prices of diesel , eggs , fertiliser and gas cylinders . Bintulu police have been cracking down on gangster activities in the town . Unscrupulous businessmen who seek cheap labour have caused a rise in the number of illegal immigrants in Bintulu . The number may have reached 50 @,@ 000 in 2009 . Bintulu immigration department has performed several operations to deport illegal immigrants back to their home country . + + = = = Ethnicity = = = + + As of the 2010 Malaysian census , the population of the town of Bintulu is 114 @,@ 058 . Indigenous people accounted for the largest proportion of the town population ( 61 @.@ 2 % , 69 @,@ 782 ) , followed by Chinese ( 25 @.@ 0 % , 28 @,@ 512 ) , Non @-@ Malaysians ( 13 @.@ 1 % , 14 @,@ 939 ) , and Indians ( 0 @.@ 28 % , 319 ) . Among the indigenous groups , there are Iban ( 32 @,@ 992 ) , Malay ( 14 @,@ 945 ) , Melanau ( 14 @,@ 179 ) , Bidayuh ( 1 @,@ 598 ) , and other indigenous tribes ( 6 @,@ 068 ) .
According to government sources , there are 229 Iban longhouses in the Bintulu District . The Ibans moved into Kemena and Tatau basins in the mid @-@ 19th and early 20th century with permission of the Brooke government . Other indigenous tribes that form the minority are Kayan , Kenyah , and Punan . The Chinese in Bintulu are mainly composed of dialect groups such as Hakka , Fuzhou , and Teochews . The Chinese have been living in the town of Tatau since the era of Bruneian Empire . Later , Fuzhou Chinese from Sibu moved in , dominating the timber and plantation businesses in Bintulu . There is also a large number of foreigners working there . Most of them come from Britain , Australia , the Netherlands , Germany , South Africa , New Zealand , Japan , China , the United States , and Indonesia . + + = = = Languages = = = + + While Malay is the official language of Sarawak , English is widely spoken there . Local ethnic languages and Chinese dialects are spoken by the respective ethnic groups . Standard Chinese is also spoken by ethnic Chinese in Bintulu . Bintulu is spoken by communities living along the Kemena River , with 4 @,@ 200 native speakers . These speakers are now recognised as part of the Melanau ethnic group , where their main language is Malay . Bintulu is classified as one of the endangered languages in Sarawak because of the isolated usage of the language in a small community . + + = = = Religion = = = + + The majority of the Bintulu population are adherents of Christian denominations due to Christian missionaries operating during the Brooke dynasty ; followed by Islam , Buddhism , and Hinduism . Among the notable places of worship in Bintulu are the Bintulu Mosque ( Masjid Assyakirin ) , Masjid Jepak , Tua Pek Kong Temple , Eng Kwang Methodist Church , and St. Thomas Church . The respective religious groups are free to hold processions in the town . + + = = Economy = = + + There are five industrial estates in Bintulu .
They are : Kemena Industrial Estate ( for wood @-@ based industries ) , Jepak Industries Estate ( wood @-@ based industries ) , Kidurong Industrial Area ( for medium and light industries ) , Kidurong Light Industrial Estate ( medium and light industries ) , and Bintulu Light Industrial Estate ( light industry ) . + + = = = Oil and gas = = = + + Malaysia LNG is a Liquefied Natural Gas ( LNG ) manufacturing complex located in Bintulu that currently contains eight LNG trains with a ninth one currently under construction . The complex was built by the Malaysian national oil and gas company , Petronas . The manufacturing complex has a production capacity of 25 @.@ 7 million tonnes per annum . Petronas is also planning to open Floating LNG ( FLNG ) offshore Bintulu , which is used specifically to harvest natural gas from small and isolated gas fields . Transportation of natural gas from the neighbouring state of Kimanis , Sabah , to the LNG complex at Bintulu is facilitated by a 512 @-@ kilometre ( 318 mi ) pipeline known as the " Sabah Sarawak Gas Pipeline " . Currently , 45 % of Malaysian natural gas is found at Central Luconia off the coast of Bintulu . The largest importers of Malaysia LNG productions are Japan ( 62 % ) , Korea ( 17 % ) , Taiwan ( 12 % ) , and China ( 9 % ) . + The Sarawak Shell Bintulu Plant ( SSBP ) , formerly known as Bintulu Crude Oil Terminal ( BCOT ) , was established in 1979 . It consists of three crude oil storage tanks , each with a capacity of 410 @,@ 000 barrels . It has three major areas of operation : Crude Oil Operations , Condensate Stabilisation , and Gas Sales Facilities . Royal Dutch Shell started to establish the world 's first Shell Middle Distillate Synthesis plant ( Shell MDS ) in 1993 . It is also known as Bintulu Gas @-@ To @-@ Liquid plant ( Bintulu GTL ) . The plant has a production capacity of 14 @,@ 770 barrels per day with a total investment of over US $ 1 billion as of the year 2010 .
The plant is staffed with 380 people , of whom 93 % are Malaysians , with 80 % of the staff coming from Sarawak . + + = = = Wood @-@ based industries and plantations = = = + + Since the opening up of the Bintulu – Miri road in the 1970s , large @-@ scale plantations of oil palm and cocoa have been developed in rural areas of Bintulu Division . Currently , there are 57 @,@ 740 hectares ( 577 @.@ 4 km2 ( 223 sq mi ) ) of oil palm , 2 @,@ 000 hectares ( 200 km2 ( 77 sq mi ) ) of Rattan , and 815 hectares ( 8 @.@ 15 km2 ( 3 sq mi ) ) of pepper plantations . + The first Bintulu palm oil refinery , Bintulu Edible Oil Sdn Bhd , was established in June 1991 . Bintulu currently has four palm oil refineries : Bintulu Edible Oils Sdn Bhd ( operated under PGEO Group , a subsidiary of Wilmar International ) , Sime Darby Austral Edible Oil Sdn Bhd , Kirana Palm Oil Refinery Sdn Bhd , and Sarawak Oil Palm Bhd . However , as of 2015 , Wilmar no longer buys raw palm oil produced from cleared forests and peat swamps in Sarawak because of environmental concerns . + The Bintulu Division has been designated as a Planted Forests Zone ( PFZ ) by the Sarawak state government since 1998 . As of 30 June 2011 , a total of 124 @,@ 618 hectares ( 1 @,@ 246 @.@ 18 km2 ( 481 sq mi ) ) has been planted with acacia trees . Other trees that are planned for plantations are kelampayan , engkabang , durian , batai , eucalyptus , and rubber trees . Sarawak Planted Forest Sdn Bhd , a company wholly owned by the Sarawak state government , has been granted a license to replant forests for 60 years . However , the company has been suffering financial losses from 2009 to 2011 . + There are three mills in Bintulu that process wood @-@ waste products . Two are Medium @-@ density fibreboard ( MDF ) plants and the third is a charcoal briquette plant , with a total installed capacity of 246 @,@ 000 cubic metres ( 8 @,@ 700 @,@ 000 cu ft ) per year .
MDF plants utilise wood waste purchased from sawmills and plywood mills in the Bintulu area and occasionally from the Tanjung Manis timber processing zone located at the mouth of the Rajang River . Synthetic resins , which are required to hold wood dust together , constituted 20 % of the total production cost of the wood panel products . MDF plants in Bintulu are operated by Daiken Sarawak Sdn Bhd , which was founded on 15 February 1994 . The briquette plant is operated by Cipta Briquette Sdn Bhd . A glue / adhesive factory in Bintulu is owned by Bintulu Adhesive & Chemicals Sdn Bhd . It produces urea formaldehyde resin and phenol formaldehyde resin for plywood and chipboard manufacturing at Kemena Industrial Estate . Urea precondensate is also produced to supply ASEAN Bintulu Fertiliser ( ABF ) plant . + + = = = Others = = = + + The Bintulu Port Authority was established in 1981 . It started port operation in 1983 at Tanjung Kidurong . Following a privatisation exercise , Bintulu Port Sdn Bhd ( BPSB ) was founded on 23 December 1992 and commenced operation on 1 January 1993 . BPA is currently responsible for regulatory exercises and security of the port . Meanwhile , BPSB is responsible for cargo handling at the Bintulu International Container Terminal ( BICT ) . The port also provides Vessel traffic service to shipping vessels . The annual total cargo throughput is 45 @.@ 4 million tonnes , consisting of 58 % LNG and 42 % non @-@ LNG products . As of 31 December 2014 , it generated a total revenue of RM 552 @.@ 3 million per year . Bintulu Port is the busiest port in Sarawak . + The ASEAN Bintulu Fertiliser plant is the anhydrous ammonia and granular plant operated by ASEAN Bintulu Fertiliser Sdn Bhd ( ABF ) , which is partly owned by Petronas . The company was formed on 6 December 1980 . The plant started operation on 1 October 1985 . It is also one of the largest granular urea plants in Asia . 
It is a joint venture by five ASEAN countries : Malaysia ( 63 @.@ 5 % shares ) , Thailand ( 13 % ) , Indonesia ( 13 % ) , the Philippines ( 9 @.@ 5 % ) , and Singapore ( 1 % ) . + Cahya Mata Sarawak Berhad ( CMSB ) , one of the largest publicly traded companies in Sarawak , set up a cement plant in Bintulu at Kidurong Industrial Estate . The plant , manned by 40 people , produces ordinary Portland cement and Portland blast furnace cement . It currently has a combined production capacity of 2 @.@ 75 million MT ( million tonnes ) . + The Bintulu combined cycle power plant was started in early 2010 with a capacity of 317 megawatts . The power plant is registered under the United Nations Clean Development Management ( CDM ) scheme as of 18 September 2010 . The plant is built to ensure efficient use of energy and reduce green house gas emissions . It is the first CDM power plant in Malaysia , currently operated by Sarawak Power Generation Sdn Bhd ( SPG ) , a wholly owned subsidiary of Sarawak Energy . + + = = Transportation = = + + + = = = Land = = = + + All roads in Bintulu are maintained by the Bintulu Development Authority ( BDA ) . Bintulu is connected to Miri and Sibu by the Pan Borneo Highway . Bintulu is also connected to Mukah and Samalaju Industrial Park . Kemena Bridge crosses the Kemena River . It is the second bridge in Malaysia built using the incremental launch method . Keppel Road in Bintulu is named after a friend of James Brooke , Sir Henry Keppel . + + = = = = Public transportation = = = = + + Bintulu has a long @-@ distance bus station , located at Medan Jaya , 5 km ( 3 mi ) northeast of the town centre . Among the areas served by the bus station are : Miri , Sibu , Kuching , Mukah , Sarikei , Oya , Dalat , Balingian , and Pontianak , Indonesia . The bus companies that serve the station are the Syarikat Baram Sdn . Bhd , MTC , Biaramas , and Suria bus lines . There are also buses that serve the town area . Taxi service is also available . 
+ + = = = Air = = = + + The old Bintulu airport was built in 1955 in the town centre . It once held the Guinness World Record of nearest airport to town . On 19 December 2002 , the airport was replaced by a new airport , which is located 23 km ( 14 mi ) away from the town centre . The surroundings of the old airport were developed into commercial and residential projects while the runway is reserved for Bintulu International Kite Festival . The new airport has a runway measuring 2 @,@ 745 m ( 9 @,@ 006 ft ) , capable of handling planes as large as the Airbus A330 . The airport currently serves three major airlines : Malaysia Airlines ( MAS ) , Air Asia , and MASwings , connecting to domestic destinations such as : Sibu , Miri , Kuching , Kuala Lumpur , and Kota Kinabalu . + + = = = Water = = = + + There is a wharf terminal at Bintulu that serves the rural areas of Bintulu Division . Among the destinations that can be reached by express boat from Bintulu are : Sebauh , Pandan , Labang , Tubau , and Binyo . + + = = Other utilities = = + + + = = = Courts of law and legal enforcement = = = + + The current court complex is located at Pisang Emas Road . It comprises the High Court , the Sessions Court , and the Magistrate Court . Bintulu also has Syariah Subordinate Court , located at Tanjung Kidurong , whose area of jurisdiction covers Bintulu District and Tatau districts . The Bintulu central police station is located at Tun Hussein Onn Road , with other police stations located at Tanjung Kidurong , Tubau , and Sebauh . There is also a central prison in Bintulu , which doubles as a correctional centre . + + = = = Healthcare = = = + + Bintulu Hospital started operation in 1968 . It is located at Nyabau Road , 12 km ( 7 @.@ 5 mi ) from the town centre . Following renovations completed on 21 May 2000 , the hospital is now equipped with 200 beds . As of 2011 , the hospital provides speciality services in seven medical disciplines .
Bintulu also has one polyclinic , Polyclinic Bintulu . There are two private hospitals in Bintulu : Columbia Asia Hospital and Bintulu Medical Centre . + + = = = Education = = = + + There are about 50 primary and eight secondary schools in Bintulu . All the schools under the National Education System are managed by the Bintulu District Education Office . The oldest primary schools in Miri are St Anthony 's Primary School ( Roman Catholic Mission School ) , Chung Hua Primary School , and the Orang Kaya Mohammad Primary School , which were established in the early 1960s . The Bintulu Government Secondary School was built in 1964 . It is now known as SMK Bintulu , the oldest secondary school in the town . Bintulu also has one Chinese independent school , Kai Dee Middle School ( 开智中学 ) . The Shell Oil Company established the Kidurong International School in 1982 to meet the primary education needs of Shell employees ' children . The school provides English National Curriculum ( ENC ) for literacy and numeracy and International Primary Curriculum ( IPC ) for other subjects . + UPM Bintulu Sarawak Campus was started as the National Resource Training Centre , Kuching , in 1974 . The oldest campus in Sarawak , it was relocated to Bintulu in 1987 as a branch campus of the Universiti Pertanian Malaysia ( UPM ) . The campus was closed down in 1992 before reopening in 2001 as Universiti Putra Malaysia ( UPM ) . During this period of closure , the campus was used as the site for Maktab Perguruan Sains Bintulu ( Bintulu Science Teachers ' Training College ) from 1994 to July 1999 when it was moved to Kota Samarahan as Institut Pendidikan Guru Kampus Tun Abdul Razak ( Tun Abdul Razak Teachers ' Training Institute Campus ) . The UPM campus is currently located 13 km ( 8 mi ) away from the town centre , occupying 715 ha ( 1 @,@ 767 acres ) , which can accommodate up to 2 @,@ 200 students . 
This branch campus currently has only one faculty , the Faculty of Agriculture and Food Sciences , consisting of five academic departments . In 2015 UPM was ranked 41st in the UI @-@ Greenmetric World University rankings . SEDAMAI College , established in November 1999 , offers courses in business , information technology , language , and engineering . + There is also a technical school located 15 km ( 9 mi ) away from the town , near Tanjung Kidurong , occupying 20 ha ( 49 acres ) of land . The school was built in 1982 with a maximum capacity of 900 students . Among the courses offered are : automotive , mechanical and civil engineering , commerce , and fashion . Gulf Golden International Flying Academy ( GGIFA ) , the first and only flying academy in Sarawak , was closed in 2012 due to financial difficulties . + + = = = Libraries = = = + + The first public library in Bintulu was built in 1971 by Bintulu District Council ( BDC ) . In 1988 the library was demolished to make way for car parks . Books from the library were moved into the former BDC building . On 29 May 2000 , the Bintulu Development Authority ( BDA ) public library was built near the Bintulu Civic Centre , which is 2 km ( 1 @.@ 2 mi ) from the town . The public library has three branches : at Kidurong , Tatau , and Sebauh . + + = = Culture and leisure = = + + + = = = Attractions and recreational spots = = = + + + = = = = Cultural = = = = + + Kampung Jepak ( Jepak village ) is a Melanau fishing village in Bintulu located near Kemena River . Among the daily activities in this village are Sago processing , fish drying , and the manufacturing of Belacan , Cencaluk ( salted shrimp ) , Terendak ( Melanau headgear ) , and Tutop ( a type of food cover ) . Kuan Yin Tong temple is located at KM2 Jalan Sultan Iskandar . It has a structural design with a rock garden courtyard , man @-@ made waterfall , and dragon fencing . 
Assyakirin mosque , meaning " Gratefulness to God " , has a man @-@ made waterfall , a fountain , and a landscape planted with flowers . The Borneo International Kite Festival has been held yearly since 2005 at the old Bintulu airport runway . It usually lasts for four to five days in September . + + = = = = Historical = = = = + + In 1987 a clock tower and a fountain were erected at Council Negri Monument . A centenary stone that was erected in 1967 to commemorate the event is kept under the clock tower . The Bintulu Tua Pek Kong temple ( near Tamu Bintulu ) is believed to have been built in the 1890s to purge the town of evil spirits . The temple survived World War II , and was rebuilt after the discovery of oil and gas reserves offshore . + + = = = = Leisure and conservation areas = = = = + + Similajau National Park is located 30 km ( 19 mi ) northeast of the town . The park was gazetted in 1976 , covering an area of 8 @,@ 996 ha ( 22 @,@ 230 acres ) ( 89 @.@ 96 km2 ( 35 sq mi ) ) with sandy beaches , rocky headlands , jungle streams , and forests . Other national parks that can be accessed along the Miri – Bintulu road are the Lambir Hills National Park and Niah National Park . + Tanjung Batu beach ( Temasya beach ) is located 3 km ( 1 @.@ 9 mi ) from the town centre . Meanwhile , Taman Tumbina ( Tumbina Park ) is located 4 km ( 2 @.@ 5 mi ) from the town centre . The park has a hornbill aviary , a butterfly garden , and a mini @-@ zoo . + + = = = = Other attractions = = = = + + Kidurong Tower is an observation tower located at Tanjung Kidurong . It offers a view of the Bintulu oil and gas facilities shortly after nightfall . Bintulu Promenade is a 3 @-@ kilometre ( 2 mi ) walkway along the Bintulu coastline with the Kemena River mouth as its focal point . It has observation points offering sunset views . There is also an 18 @-@ hole golf course at Bintulu .
+ + = = = = Shopping = = = = + + There are several shopping malls in Bintulu : ParkCity Mall , City Point , Ngiu Kee Departmental Stores , Farley shopping complex , Sing Kwong Supermarkets , and MDS @-@ Mart . Time Square Mall is currently under development in Bintulu , with Everrise as the anchor tenant . + Tamu Bintulu and Pasar Utama are the two main markets in the town . Both places have a unique cone @-@ shaped roof that symbolises the traditional Melanau headgear named Terendak . Tamu Bintulu offers items ranging from jungle produce to native home @-@ made specialties such as Belacan . Meanwhile , Pasar Utama houses both wet market and dry market under one roof , providing fresh vegetables , fruits , fish , and dairy products . The first floor of Pasar Utama offers a variety of fast food such as Laksa , Kolok Mee , Jawa Mee , Pulut Panggang , ais batu campur , cendol , and teh tarik . The Bintulu night market is located on Kampung Dagang road . There are over 150 stalls selling a variety of items such as garments , electric goods , vegetables , fruits , food and drinks . + + + = Zrinski Battalion = + + The Zrinski Battalion ( Croatian : Bojna Zrinski ) was a special forces unit of the Croatian National Guard ( Zbor narodne garde – ZNG ) and later of the Croatian Army ( Hrvatska vojska – HV ) established in Kumrovec on 18 May 1991 , during the Croatian War of Independence . The unit drew personnel from the special police forces and former French Foreign Legion troops serving as its core . The battalion was set up and initially commanded by Ante Roso , while Major Miljenko Filipović took over as the commanding officer in August . + The Zrinski Battalion trained volunteer troops in Vukovar in June 1991 before it saw action in Hrvatska Kostajnica , the Battle of Gospić and near Slano in 1991 . By the end of 1991 , the unit 's personnel were tasked with setting up an additional special forces unit of the HV .
The next year its elements took part in the Battle of Kupres and Operation Tiger aimed at lifting the Siege of Dubrovnik . It also helped develop and train the Croatian Defence Council ( Hrvatsko vijeće obrane – HVO ) , setting up a training camp in Tomislavgrad . In 1993 , the battalion took part in Operation Maslenica . In February 1994 , the Zrinski Battalion was amalgamated with several other HV special forces units into the 1st Croatian Guards Brigade ( 1 @.@ hrvatski gardijski zdrug ) , a component of the 1st Croatian Guards Corps ( 1 @.@ hrvatski gardijski zbor ) . + + = = Background = = + + In 1990 , following the electoral defeat of the government of the Socialist Republic of Croatia , ethnic tensions between Croats and Croatian Serbs worsened . The Yugoslav People 's Army ( Jugoslavenska narodna armija – JNA ) believed Croatia would use the Croatian Territorial Defence Force 's ( Teritorijalna obrana – TO ) equipment to build its own army and confront the JNA itself . In order to minimize the expected resistance , the JNA confiscated the TO weapons . On 17 August , the tensions escalated into an open revolt of the Croatian Serbs . + In the beginning of 1991 , Croatia had no regular army . In an effort to bolster its defence , Croatia doubled the size of its police force to about 20 @,@ 000 . The most effective part of the force was the 3 @,@ 000 @-@ strong special police that were deployed in 12 battalions , adopting military organisation . In addition there were 9 @,@ 000 – 10 @,@ 000 regionally organised reserve police . The reserve police were set up in 16 battalions and 10 companies , but they lacked weapons needed to arm many of the troops . + Preparations to set up the Croatian National Guard ( Zbor narodne garde – ZNG ) began on 12 April 1991 . 
Establishment of the ZNG as a police force with military capabilities was thought necessary by the Croatian authorities following armed clashes in Pakrac and at Plitvice Lakes in March and due to the possibility of further confrontation with the JNA . The ZNG , formally established on 23 April , was tasked with protection of the constitutional order , maintenance of public order , anti @-@ terrorist operations , protection of Croatia 's borders , territory , coast and territorial waters , as well as the protection of high @-@ value structures and high @-@ profile persons . + + = = Service = = + + On 18 May 1991 , the Zrinski Battalion was established as a special forces unit of the ZNG . The core of the unit consisted of 27 volunteers drawn from the Kumrovec Special Police Unit ( SPU ) . Initially , it relied on former French Foreign Legion troops . The most senior among the former legionnaires was Ante Roso , previously a Sous @-@ Officier ( non @-@ commissioned officer – NCO ) in the 4th Foreign Regiment . In consequence , Roso was tasked with setting up the unit as its initial commander . Major Miljenko Filipović , likewise a former French Foreign Legion member , was assigned as the battalion 's deputy commander . The unit was based in the village of Kumrovec in the region of Hrvatsko Zagorje , on the grounds of the former " Josip Broz Tito " political school . The site , adjacent to the border of Slovenia , was selected to be inaccessible to Yugoslav Air Force raids without violation of Slovene or possibly Austrian airspace . In June 1991 , the Kumrovec SPU was transferred to Sljeme Peak north of Zagreb leaving Kumrovec base to the Zrinski Battalion , as well as the second special forces unit , the Frankopan Battalion . + The Zrinski Battalion was deployed for the first time on 15 June . It was stationed in Vukovar , tasked with preparation of city defences and organisation of volunteer troops . In August , Filipović took over command of the battalion from Roso . 
The same month , the Zrinski Battalion was deployed to the Banovina , where it pushed the Croatian Serb forces out of the town of Hrvatska Kostajnica . In September , the battalion was deployed to Gospić , where it took part in the battle to control Gospić against the JNA . Troops assigned to the battalion captured Kaniža barracks in Gospić . During combat in Gospić , 30 troops of the Zrinski Battalion , assisted by Lučko SPU , captured JNA Major General Trajče Krstevski , along with three armoured personnel carriers ( APCs ) and 32 soldiers . The unit was deployed to Metković on 28 October , tasked with recapturing Slano from the JNA . After the deployment to Gospić , a part of the unit personnel left for Bosnia and Herzegovina anticipating further conflict there , while the remainder of the unit returned to Kumrovec . The ZNG was renamed the Croatian Army ( Hrvatska vojska – HV ) on 3 November 1991 . In late 1991 , personnel of the Zrinski Battalion set up another special forces unit of the HV — the Matija Vlačić Battalion based in Opatija . + In 1992 , elements of the Zrinski Battalion took part in the Battle of Kupres , before setting up a training camp in the town of Tomislavgrad . There the battalion personnel assisted in setting up and trained the Croatian Defence Council ( Hrvatsko vijeće obrane – HVO ) . Later that year , elements of the battalion took part in Operation Tiger — aimed at lifting the Siege of Dubrovnik . In 1993 , elements of the Zrinski Battalion took part in Operation Maslenica , fighting in the area of Škabrnja . The Central Intelligence Agency assessed the Zrinski Battalion as one of the best units of the HV . 
+ + = = Amalgamation = = + + On 25 February 1994 , the Zrinski Battalion was amalgamated with parts of other special forces units of the HV : Frankopan Battalion , Ban Jelačić Battalion , Matija Vlačić Battalion , Ferdo Sučić Battalion and part of 8th Light Assault Brigade forming the 1st Croatian Guards Brigade ( 1 @.@ hrvatski gardijski zdrug ) , a component of the 1st Croatian Guards Corps ( 1 @.@ hrvatski gardijski zbor ) , directly subordinated to the Ministry of Defence rather than the General Staff of the Armed Forces of the Republic of Croatia . + + + = Weevils Wobble But They Don 't Go Down = + + " Weevils Wobble But They Don 't Go Down " is the nineteenth and penultimate episode of the third season of the American mystery television series Veronica Mars , and the 63rd episode overall . Written by Phil Klemmer and directed by Jason Bloom , the episode premiered on The CW on May 22 , 2007 . The series depicts the adventures of Veronica Mars ( Kristen Bell ) as she deals with life as a college student while moonlighting as a private detective . + In this episode , Weevil ( Francis Capra ) enlists Veronica 's help in proving his innocence when he is implicated in creating fake student IDs . Meanwhile , Veronica and Piz ( Chris Lowell ) come to terms with Veronica 's FBI internship , and a sex tape of this couple is released on the internet . Logan ( Jason Dohring ) beats up Piz , thinking that he posted it . In addition , Keith ( Enrico Colantoni ) and Vinnie ( Ken Marino ) debate on Piz 's radio show about the upcoming Sheriff 's election in which they are running against each other . + " Weevils Wobble But They Don 't Go Down " features the reappearance of Weevil after an absence of five episodes ; during filming of the third season , Capra was undergoing medical treatment . Series creator Rob Thomas pointed out Logan and Piz 's fight scene as one of the highlights of the episode and the season . 
In its original broadcast , the episode received mostly positive reviews from television critics , with many praising the case @-@ of @-@ the @-@ week . + + = = Synopsis = = + + In a checkout line , Veronica and Mac ( Tina Majorino ) discuss her relationship with Piz and what her FBI internship will mean for them . They then see someone getting arrested by the campus police for a fake debit card . Weevil injures himself , but finds himself unable to receive adequate benefits . Veronica gets Piz to accept her leaving for the internship . At the Sheriff ’ s station , Weevil is called out by several students as being the one who gave them the fake cards . Weevil is put in the jail cell , but he tells Veronica that he thinks they targeted him after he was showcased in the criminology class . She investigates several of the owners of the student IDs , who do seem genuine in accusing Weevil . Weevil gets bail , but Keith informs him that a student ID printing machine was found in the locker next to Weevil ’ s . Veronica and Wallace speak to a mechanical engineering professor , who says that it would be impossible for Weevil to be the culprit . Piz invites Keith to appear on his show in an election special before Keith informs Veronica that Weevil ’ s fingerprints were found all over the investigation . + Keith and Vinnie debate on Piz ’ s radio show , and Vinnie gathers some support for his lax underage drinking policy . Wallace notices someone following him , and the student wants to recruit him for a secret society . Vinnie insults Keith ’ s handling of his home life , and Veronica hits him on the head lightly . She helps Weevil retrace his steps and notices one of her criminology classmates , Jenny ( Dianna Agron ) , involved with one of Weevil ’ s clients . Veronica deduces that Jenny and her circle of friends are responsible for creating the fake student IDs , disseminating them , and implicating Weevil . 
Logan and Dick ( Ryan Hansen ) go surfing , and they run into Veronica , Piz , and Mac helping Wallace ( Percy Daggs III ) with his final project for a class . Veronica learns that one of Jenny ’ s group lives in the same town in Georgia where the student ID machines are made . + Veronica is called into a room in the library by Jenny ’ s group , who try to bribe her into not turning them in to the Hearst police . However , she denies and records their conversation as further proof . They are hiding a third ID machine , and Veronica tells Weevil to go and find it . Dick shows Logan a sex tape of Piz and Veronica that has been circulating in an email . Because Logan thinks Piz posted the tape , he attacks Piz and hits him incessantly . Weevil does have a student ID machine , and Logan walks into Mars Investigations with Piz ’ s blood on him . + + = = Production = = + + " Weevils Wobble But They Don 't Go Down " was written by Phil Klemmer and directed by Jason Bloom , marking Klemmer 's fifteenth and final writing credit and Bloom 's fourth and final directing credit for Veronica Mars , after " Green @-@ Eyed Monster " , " Nevermind the Buttocks " , and " Charlie Don 't Surf " . The episode features the final appearance of Dianna Agron , famous for her role as Quinn Fabray on Glee , as Jenny Budosh , Veronica ’ s classmate . Agron had previously appeared in the episode “ President Evil ” . The episode prominently features Weevil ( Francis Capra ) , who appears after a five episode hiatus . During the third season , Capra was struggling with a medical condition , and the medication he took for this illness caused his face and neck to swell and break out with acne . + Series creator Rob Thomas included the scene in which Logan attacks Piz on his list of highlights from the third season . 
When Thomas first viewed the scene with his wife , she emitted an audible groan when she saw Logan walking through the cafeteria towards Piz , anticipating the fight that was about to come . Thomas thought that most fight scenes on the show were not well @-@ done , as they are filmed quickly and without a second unit . However , he was pleased with the final cut , stating that there was only one punch that he thought looked fake . + + = = Reception = = + + + = = = Ratings = = = + + In its original broadcast , “ Weevils Wobble But They Don ’ t Go Down ” received 1 @.@ 78 million viewers , ranking 77th of 85 in the weekly rankings . This was a decrease from the previous episode , “ I Know What You 'll Do Next Summer ” , which garnered 2 @.@ 10 million watchers . + + = = = Reviews = = = + + Eric Goldman , writing for IGN , graded the episode a 9 @.@ 0 out of 10 , indicating that it was “ amazing ” . His very positive review focused on the ambiguous nature of the main plot and the subplots of Dick and Logan . He praised the presence of the majority of the main cast members , also stating that Dick ’ s reaction to dealing with his brother was realistic for him and in character . The reviewer called the conclusion to the case @-@ of @-@ the @-@ week a “ fun and satisfying one ” that highlighted the moral ambiguity of main characters , something “ that the show had been lacking for a while . ” He highlighted this ambiguity in both Weevil and Logan , elaborating that Logan attacking Piz showed an interesting side to his personality that had not been present all season . The reviewer also lauded the cliffhanger ending , stating , “ it was a reminder of how exciting and intense this show can be at its best . ” Television Without Pity did not grant the episode a rating but lauded the characterization of Weevil , stating , “ This is the most consistently written character on the show , bar none . 
” + Kelly West of Cinema Blend focused primarily on the series finale in her review but referred to this episode as containing a solid case @-@ of @-@ the @-@ week . “ Overall it was a good mystery @-@ of @-@ the @-@ week but seeing as it was the second to last episode , the only thing I cared about was that we finally got more Weevil ! ” Rowan Kaiser of The A.V. Club gave the episode a mixed review , stating that the case @-@ of @-@ the @-@ week was an oversimplification of the overall themes of Veronica Mars . The reviewer enjoyed the potential for Logan and Dick ’ s character development , although he felt that it was blunted by the knowledge that the series was about to end . “ As fun as this episode is , the lack of the fourth season stops me from fully enjoying it . [ … ] But the knowledge that this might be the last time we see them puts a damper on things . Still , better to go out with good episodes than bad . ” + + + = Temple Beth Israel ( Eugene , Oregon ) = + + Temple Beth Israel ( Hebrew : בית ישראל ) is a Reconstructionist synagogue located at 1175 East 29th Avenue in Eugene , Oregon . Founded in the early 1930s as a Conservative congregation , Beth Israel was for many decades the only synagogue in Eugene . + The congregation initially worshipped in a converted house on West Eighth Street . It constructed its first building on Portland Street in 1952 , and occupied its current LEED @-@ compliant facilities in 2008 . + In the early 1990s conflict between feminist and traditional members led to the latter leaving Beth Israel , and forming the Orthodox Congregation Ahavas Torah . Beth Israel came under attack from neo @-@ Nazi members of the Volksfront twice , in 1994 and again in 2002 . In both cases the perpetrators were caught and convicted . + Services were lay @-@ led for decades . Marcus Simmons was hired as the congregation 's first rabbi in 1959 , but left in 1961 . 
After a gap of two years , Louis Neimand became rabbi in 1963 , and served until his death in 1976 . He was followed by Myron Kinberg , who served from 1977 to 1994 , and Kinberg in turn was succeeded by Yitzhak Husbands @-@ Hankin . Maurice Harris joined Husbands @-@ Hankin as associate rabbi in 2003 , and served until 2011 , when he was succeeded by Boris Dolin . As of 2014 , led by Husbands @-@ Hankin and Dolin , Beth Israel had approximately 400 member households , and was the largest synagogue in Eugene . + + = = Early history = = + + Small numbers of German Jews began settling in Eugene in the late 19th century , but most moved on . In the early 20th century the first Eastern European Jews settled there , and by the 1920s Eugene 's Jewish community began gathering prayer quorums for holding Friday night and Jewish holiday services in individuals ' homes . Historian Steven Lowenstein writes that " [ a ] fter Hymen Rubenstein 's death in 1933 , his home at 231 West Eighth Street was remodeled and named Temple Beth Israel " . It was a traditional Conservative synagogue , and from that time until the 1990s it was the only synagogue in Eugene . + In 1952 , the congregation constructed a one @-@ story synagogue building on an almost 1 acre ( 0 @.@ 40 ha ) property at 2550 Portland Street . Designed by architect and Holocaust @-@ survivor Heinrich Hormuth ( H.H. ) Waechter , the building featured an interior courtyard that provided natural lighting , and " a network of ceiling beams painted with symbols and shapes " by Waechter . + Temple Beth Israel 's services and religious functions were lay @-@ led for decades . Its first rabbi was Marcus Simmons . Originally from England , he was a graduate of University of London and Oxford University , and was ordained at the Hebrew Theological Seminary . He emigrated to the United States in 1957 , and joined Beth Israel in 1959 . 
The members were not , however , agreed that a full @-@ time rabbi was required , and in 1961 , he accepted a rabbinical position in Downey , California . + Following a hiatus of two years , Louis Neimand was hired as rabbi in 1963 . Born in New York City in 1912 to immigrant parents , he was a graduate of City University of New York and was ordained at the Jewish Institute of New York . He had previously worked for the United Jewish Appeal , and from 1959 to 1963 was the first Hillel rabbi at Syracuse University . There was some concern about Neimand 's hiring , as he had a police record as a result of his involvement in freedom marches in the African @-@ American Civil Rights Movement ( 1955 – 1968 ) . He served until his death in 1976 . + + = = Kinberg era = = + + Myron Kinberg was hired as rabbi in 1977 . Ordained in Reform Judaism , he had previously served as a rabbi in Topeka , Kansas for two years , then lived in Israel for two years , before coming to Eugene . Kinberg was known for his support for minority rights and gay rights , anti @-@ nuclear and anti @-@ war activism , support of reconciliation between Israel and the Palestinians , and outreach to non @-@ observant members of Eugene 's Jewish community . + Kinberg attempted to revive the Biblical concept of the " ger toshav " in his approach to intermarriage . He was willing to officiate at an intermarriage if the non @-@ Jewish partner , after discussions with the rabbi , agreed of his or her own free will to fulfill a set of commitments , including " a commitment to a Jewish home life , participation in Jewish life and tradition , and raising future children as Jews " . The non @-@ Jewish partner making this commitment became a " ger toshav " , or " non @-@ Jewish member of the Jewish people " . + Kinberg 's wife Alice was a strong feminist , and during the 1980s he and his wife supported a number of changes to the liturgy and ritual . 
These included allowing women to read from the Torah and lead the prayers , and changing prayers to be more gender inclusive - for example , using gender @-@ neutral terms and pronouns for God , and adding references to the Biblical matriarchs in prayers like the Amidah , which traditionally only mentioned the Biblical patriarchs . While most congregation members approved of these changes , a minority resisted them . + + = = = Schism = = = + + By the early 1990s serious divisions developed among the members of the congregation over a number of issues , including personal antagonisms , the rabbi 's activism and " advocacy of ' ultra @-@ liberal ' causes " , political differences over the Israeli – Palestinian conflict , and + a myriad of additional Jewish cultural / religious issues , such as the acceptance of intermarried couples , adherence to kosher dietary laws , the use of modern language and music during worship services , rewriting of certain prayers such as the Aleynu to make them less ethnocentric , and so on . + However , the biggest source of division , which underlay all others , was " the roles and rights of men and women in the synagogue . " + In the early 1990s a group of newly observant members began holding more traditional services in a back room of the synagogue , complete with a mechitza , a partition separating men and women . The " more feminist @-@ minded " members strongly objected to having a mechitza anywhere in the Temple Beth Israel building , even if it were not in the services they attended . The latter group eventually circulated a petition which stated that either the mechitza would have to be taken down , or those members who wanted it would have to leave . Kinberg also signed the petition . 
Faced with this opposition , in 1992 the Orthodox members left , renting new premises and hiring their own rabbi , creating Eugene 's second synagogue , originally called " The Halachic Minyan " , and in 1998 renamed " Congregation Ahavas Torah " . + Kinberg held himself responsible , and the schism led to his " reassessment of the needs of Temple Beth Israel and his role as a rabbi " . As a result , he left Beth Israel in 1994 to lead a synagogue on Long Island . During his tenure at Beth Israel , membership rose from 118 to 350 families . Kinberg died two years later at age 51 . + + = = Husbands @-@ Hankin era = = + + Yitzhak Husbands @-@ Hankin succeeded Kinberg in 1995 . Husbands @-@ Hankin began his involvement at Temple Beth Israel first as a congregant , then as cantor , and then as an assistant rabbi . He was active in forming the Jewish Renewal movement , and was ordained by its leader Zalman Schachter @-@ Shalomi . + The congregation decided to leave the Conservative movement in 1995 , and for a year had no affiliation . In late 1996 , after considering both Reform and Reconstructionist as alternatives , the congregation affiliated with the Reconstructionist movement . By 1999 , membership had grown to around 370 families . + Husbands @-@ Hankin was instrumental in developing the concept of " Ethical Kashrut " , the idea that one should only purchase goods that are produced in an ethical way . His essay , " Ethical Kashrut , " was selected for publication in Arthur Kurzweil 's Best Jewish Writing 2003 . A singer , cello and guitar player , he composes and performs Jewish music . + Husbands @-@ Hankin has had four assistant or associate rabbis working with him . Shoshana Spergel joined Temple Beth Israel in 1998 as interim rabbi when Husbands @-@ Hankin went on a sabbatical ; Jonathan Seidel was assistant rabbi from 2001 to 2003 . Maurice Harris , a 2003 graduate of the Reconstructionist Rabbinical College , joined as assistant rabbi in 2003 . 
He is one of the signators of The Open Letter Concerning Religion and Science From American Rabbis , part of the Clergy Letter Project which " encourages and embraces the teaching of evolution in schools " . In 2011 , Boris Dolin joined the congregation as its newest associate rabbi . + + = = = Attacks by neo @-@ Nazis = = = + + On March 20 , 1994 , Chris Lord , an individual associated with the Volksfront and American Front , fired ten rounds with an assault rifle into the temple , damaging the interior . The attacks were prompted by a newspaper article about several members of Eugene 's Jewish community , including a lesbian . Community organizations , including a local gay rights group , responded by standing vigil outside the synagogue during Passover services . Lord and an associate were caught and convicted , and Lord was sentenced to four and a half years in prison . + On October 25 , 2002 Jacob Laskey , his brother Gabriel Laskey , Gerald Poundstone , Jesse Baker , and one other man , all members of the Volksfront , drove to Beth Israel with the intent of intimidating the congregants . While a service with 80 members attending was taking place , the men threw rocks etched with Nazi swastikas through the synagogue 's stained glass windows , then sped off . The men were caught , pleaded guilty , and were convicted . They served sentences ranging from a 6 @-@ month work release term and five years probation , to eleven years and three months in federal prison for the ringleader , Jacob Laskey . + + = = = East 29th Avenue building = = = + + Originally sized for 75 families , Temple Beth Israel 's Portland Street building had been renovated and enlarged over the years to 7 @,@ 500 square feet ( 700 m2 ) to accommodate 250 families and 150 students . Despite these additions and the loss of members to Congregation Ahavas Torah , the synagogue was not large enough , particularly during the High Holidays , when extra space had to be rented . 
In 1997 the congregation purchased the property of the University Street Christian Church for $ 500 @,@ 000 ( today $ 740 @,@ 000 ) , and began planning for a new facility . The members considered renovating the existing building on the property , but felt a new building would better suit their requirements , and razed the church . + In 2003 the congregation got a permit to begin construction of a new facility on the now @-@ vacant 1 @.@ 37 @-@ acre ( 0 @.@ 55 ha ) plot of land at the northwest corner of East 29th Avenue and University Street . An initial capital campaign raised more than $ 1 @.@ 8 million , which fully paid for the land , and by August 2007 an additional $ 1 @.@ 7 million had been raised towards anticipated overall project costs of $ 5 million . + The environmentally sensitive building was designed by Mel Solomon and Associates of Kansas City and local company TBG Architects & Planners , and built by McKenzie Commercial Construction of Eugene . The building used " energy efficient heating , ventilation and lighting " : specific design issues with the building 's energy efficiency included the fact that the largest room in the building , the sanctuary , was also the least @-@ used , and , in accord with Jewish tradition , had to face east ( towards Jerusalem ) . + On June 8 , 2008 the congregation dedicated its new building at 1175 East 29th Avenue . At approximately 25 @,@ 000 square feet ( 2 @,@ 300 m2 ) , the facility included a sanctuary , commercial kitchen , banquet facilities , and classrooms , and housed the synagogue , the Lane County Jewish Federation , and the local Jewish Family Service . The project ended up costing $ 6 million , of which $ 4 million had been raised . 
+ Made of concrete , steel , and wood , the building achieved Leadership in Energy and Environmental Design compliance " through the integration of stormwater management strategies , high efficiency irrigation , the use of recycled and / or recyclable materials , and drought tolerant plantings . " Completely recyclable materials used in the structure included carpeting and wood beams . + + = = Recent events = = + + In 2008 , Temple Beth Israel participated in Banners Across America , an " interfaith witness against torture coordinated by the National Religious Campaign Against Torture , " as part of the Jewish Campaign Against Torture . Organized by Rabbis for Human Rights — North America in honor of Torture Awareness Month , the Jewish campaign included over 25 synagogues which hung banners protesting " the use of abusive interrogation techniques by the American military and intelligence community " . That year , congregational membership reached almost 400 families , and the Talmud Torah and pre @-@ school had about 200 and 40 students respectively . + The congregation sold the old synagogue building on Portland Street to Security First ( Portland Street ) Child Development Center for $ 815 @,@ 000 in 2009 , carrying the Center 's financing . The building was converted for use as an educational center , while retaining some of the original architectural elements . Difficult economic conditions forced the Child Development Center to give up the building in 2011 , and Eugene 's Network Charter School planned to move into it in autumn 2011 . + Harris announced he would be stepping down as rabbi in 2011 , and the synagogue hired Boris Dolin as his successor . Born and raised in Oregon , Dolin had worked at Temple Beth Israel as a teacher and youth group adviser from 1999 to 2001 . A graduate of the University of Oregon , with a master 's degree in Jewish Education from the Jewish Theological Seminary , he was ordained by the Reconstructionist Rabbinical College . 
+ As of 2011 , Temple Beth Israel was the largest synagogue in Eugene . It was a member of the Community of Welcoming Congregations , " an Oregon and SW Washington interfaith ministry and advocacy organization working toward full inclusion and equality for transgender , lesbian , bisexual , gay and questioning persons . " The rabbis were Yitzhak Husbands @-@ Hankin and Boris Dolin . + + + = New York State Route 93 = + + New York State Route 93 ( NY 93 ) is a 43 @.@ 08 @-@ mile ( 69 @.@ 33 km ) state highway in western New York in the United States . The route begins at an intersection with NY 18F in the village of Youngstown and runs in a general northwest – southeast direction across Niagara and Erie counties to its east end at an intersection with NY 5 in the town of Newstead , just south of the village of Akron . NY 93 serves as a connector between several major arterials , including NY 104 in Cambria , NY 31 just west of the city of Lockport , and NY 78 south of the city . + The route was assigned as part of the 1930 renumbering of state highways in New York . Although it began in Youngstown and ended in Newstead as it does today , the initial routing of NY 93 deviated from the modern path in the vicinity of the city of Lockport . From Cambria to Lockport 's eastern suburbs , the highway originally used NY 425 , Lower Mountain Road , Akron Road , and a series of streets in Lockport . NY 93 was moved onto NY 104 and Junction Road in Cambria in the 1940s , and altered to bypass Lockport to the south on a new highway and Robinson and Dysinger roads in 1991 . In 2006 , NY 93 was realigned west of Lockport to continue south on Junction Road to NY 31 . The change removed NY 93 from Upper Mountain Road , a county @-@ owned highway that had been part of the route since the 1930s . 
+ + = = Route description = = + + + = = = West of Lockport = = = + + NY 93 begins at an intersection with NY 18F ( Main Street ; co @-@ designated but not signed as County Route 907 or CR 907 ) in the center of the village of Youngstown . The route proceeds eastward through the village as a two @-@ lane road named Lockport Street , serving two blocks of commercial areas before bending to the northeast and passing into the residential eastern portion of Youngstown . At the eastern village limits , NY 93 briefly widens to four lanes as it enters a partial cloverleaf interchange with the Niagara Scenic Parkway . Past the junction , the highway reverts to a two @-@ lane road and changes names to Youngstown – Lockport Road as it runs across the town of Porter . The residential surroundings continue to the hamlet of Towers Corners , where NY 93 connects to NY 18 ( Creek Road ) . + After NY 18 , NY 93 curves to the southeast , serving another residential stretch ahead of a junction with Youngstown – Wilson Road ( CR 36 ) on the eastern edge of Towers Corners . After this intersection , the homes give way to farms as the road heads into rural areas of the town . The route continues on a southeast track through Porter , passing a mixture of rural and residential areas on its way into the hamlet of Porter Center , where NY 93 enters an intersection with Porter Center Road ( CR 57 ) . Another southeastward stretch brings the route across Twelvemile Creek and into the hamlet of Ransomville , where NY 93 becomes the community 's main street . Through Ransomville , NY 93 retains the Youngstown – Lockport Road name , intersecting with Ransomville Road ( CR 17 ) in the hamlet 's business district . + Just outside Ransomville , NY 93 leaves the town of Porter for the town of Wilson . It continues generally southeastward across mostly open terrain , meeting Randall Road ( CR 83 ) and Church Street ( CR 56 ) on its way to the town of Cambria . 
NY 93 becomes North Ridge Road at the town line , and it soon enters the hamlet of North Ridge , a community built up around the route 's intersection with NY 425 ( Cambria – Wilson Road ) . The hamlet 's residential surroundings continue to the adjacent community of Molyneaux Corners , where NY 93 becomes concurrent with NY 104 ( Ridge Road ) . NY 93 and NY 104 proceed northeast across lightly populated areas for 2 miles ( 3 @.@ 2 km ) to the hamlet of Warren Corners , at which point NY 93 splits from NY 104 and heads southward along Town Line Road . It immediately intersects with Stone Road ( CR 19 ) before leaving the hamlet . + + = = = Lockport area = = = + + Outside of Warren Corners , the route heads across rural areas along the Cambria – Lockport town line . It soon enters the small hamlet of Hickory Corners , where the road passes under Lower Mountain Road ( CR 902 ) . Access to the highway is made by way of Town Line Road Spur ( CR 114 ) , a connector leading to Lower Mountain Road . NY 93 continues southward along the town line , changing names to Junction Road at an intersection with Upper Mountain Road ( CR 5 ) west of the city of Lockport . From here , the route crosses over CSX Transportation 's Lockport Subdivision rail line at the hamlet of Lockport Junction before intersecting with NY 31 ( Saunders Settlement Road ) and NY 270 ( Campbell Boulevard ) just south of the community . NY 270 begins straight ahead to the south while NY 93 turns northeast onto Saunders Settlement Road , beginning a concurrency with NY 31 . + Now fully in the town of Lockport , NY 31 and NY 93 proceed northeast through an open area of the town as a four @-@ lane divided highway . The two routes continue to the western edge of the city of Lockport , where they intersect with Upper Mountain Road and the Lockport Bypass . The overlap ends here as NY 93 turns southeastward onto the two @-@ lane bypass . 
Along the bypass , NY 93 briefly enters the city limits as it runs past several industrial facilities and intersects with Hinman Road ( CR 903 ) just ahead of a bridge over the Erie Canal . Past the waterway , the bypass takes a more southerly course through an undeveloped part of the town of Lockport to a junction with Robinson Road ( CR 123 ) on the Lockport – Pendleton town line . The Lockport Bypass ends here , leaving NY 93 to turn eastward onto Robinson Road . + The route initially serves a line of homes as it heads along Robinson Road ; however , it soon enters a commercial district surrounding the road 's intersection with NY 78 ( Transit Road ) . At this point , the Lockport – Pendleton town line turns south to follow NY 78 , leaving NY 93 fully within the town of Lockport as it runs eastward past another stretch of homes . Not far from NY 78 , NY 93 changes names to Dysinger Road at an intersection with Beattie Avenue ( CR 14 ) and Raymond Road ( CR 85 ) . The junction also marks a shift in the road 's surroundings as the homes give way to open , rolling terrain . NY 93 continues eastward for several miles to the town of Royalton , where it meets Riddle Road ( CR 35 ) and Akron Road ( CR 142 ) at adjacent intersections just east of the town line . + + = = = East of Lockport = = = + + NY 93 takes over Akron Road 's name and right @-@ of @-@ way , continuing eastward past a line of scattered homes to reach the sparsely developed hamlet of Dysinger . Here , the route turns southward at a junction with Bunker Hill Road ( CR 136 ) . Outside of Dysinger , NY 93 heads southeastward across undeveloped areas of Royalton , connecting to Block Church Road ( CR 110 ) as it approaches Tonawanda Creek and the Niagara – Erie county line . The road runs along the northern edge of the creek for about 1 @.@ 5 miles ( 2 @.@ 4 km ) prior to curving southward at an intersection with Wolcottsville Road ( CR 122 ) . 
The turn brings NY 93 across Tonawanda Creek and into the Erie County town of Newstead , where it becomes known as Maple Road and immediately intersects with CR 260 ( Koepsel Road ) . + Continuing southward , NY 93 runs across open , rolling terrain , meeting CR 259 ( Tonawanda Creek Road ) on its way to the hamlet of Swifts Mills . Here , the rural surroundings briefly give way to residential areas as NY 93 intersects with CR 255 ( Swift Mills Road ) in the center of the community . South of Swifts Mills , the road serves only intermittent stretches of homes for 2 miles ( 3 @.@ 2 km ) , including a cluster of residences around its closely spaced intersections with CR 253 ( Carney Road ) and CR 42 ( Rapids Road ) . It continues on a southward track past the eastern terminus of CR 218 ( Hunts Corner – Akron Road ) to the outskirts of the village of Akron , where the highway turns east onto Lewis Road and soon enters the village limits . NY 93 runs past a line of homes before intersecting Cedar Street , a road maintained by Erie County as CR 261 north of the village . + The route turns south at Cedar Street , following the residential street into downtown Akron . Here , NY 93 intersects with CR 573 ( John Street ) at a junction that was once the western terminus of NY 267 . At this intersection , NY 93 heads west on John Street for one block before continuing south on Buffalo Street for another block to Main Street . NY 93 turns westward again , following Main Street through the westernmost part of Akron 's central business district prior to curving southwestward at a junction with Mechanic Street . The highway takes on the Mechanic Street name as it crosses over Murder Creek and leaves downtown Akron . Just south of the creek , NY 93 changes names to Buell Street at an intersection with Jackson Street . 
+ As the route continues southward through the southern part of Akron , it serves mostly residential areas , save for an industrial complex at NY 93 's intersection with CR 163 ( Clarence Center Road ) and CR 167 ( Parkview Drive ) . NY 93 exits Akron a short distance south of the junction , at which point the route heads into another area of open fields while retaining the Buell Street name . It continues on a southward track for about 1 mile ( 1 @.@ 6 km ) to a commercialized intersection with NY 5 ( Main Road ) , where Buell Street and NY 93 both come to an end . + + = = History = = + + + = = = Designation and early changes = = = + + NY 93 was established as part of the 1930 renumbering of state highways in New York , connecting the cities and villages of Youngstown , Lockport , and Akron . While the termini of NY 93 have remained the same to this day , several portions of the route have been realigned since that time . When NY 93 was first assigned , it turned south at the hamlet of North Ridge and overlapped with NY 425 along Cambria – Wilson Road to Lower Mountain Road , then part of NY 3 . NY 425 went west from this junction while NY 93 headed eastward , following NY 3 along Lower Mountain , Gothic Hill , Upper Mountain , and Saunders Settlement roads to the city of Lockport . At Locust Street , NY 93 left NY 3 and exited the city along Locust , High , and Akron streets and Akron Road . It met its current alignment southeast of the city in Royalton . + NY 3 was realigned c . 1932 to follow Saunders Settlement Road between Shawnee Road ( NY 425 ) and Upper Mountain Road . The former routing of NY 3 along Shawnee , Lower Mountain , Gothic Hill , and Upper Mountain roads was redesignated as NY 3A even though all of NY 3 's former routing was already part of either NY 425 or NY 93 . The NY 3A designation was eliminated c . 1935 when NY 3 was truncated eastward to a new western terminus in central New York . 
In the early 1940s , NY 93 was altered to follow North Ridge Road , U.S. Route 104 ( now NY 104 ) , and Junction Road between North Ridge and Lower Mountain Road . + Around the same time that NY 93 was rerouted , NY 270 was also extended northward along Junction Road from NY 31 to US 104 . As a result , NY 93 overlapped NY 270 between Lower Mountain Road and US 104 . The overlap with NY 270 remained in place until c . 1963 when NY 270 was truncated southward to the intersection of Lower Mountain and Junction roads . NY 93 was realigned in the late 1970s to bypass Lower Mountain and Gothic Hill Roads on Junction and Upper Mountain roads , replacing NY 270 along Junction Road . The Lower Mountain Road portion of NY 93 's former routing is now maintained by Niagara County as County Route 902 ( CR 902 ) . + + = = = Lockport realignments = = = + + The Lockport Bypass , a highway bypassing downtown Lockport to the southwest , was opened to traffic on July 26 , 1991 . The highway cost $ 7 @.@ 7 million ( equivalent to $ 13 @.@ 4 million in 2016 ) to construct and extended from the junction of NY 31 and NY 93 west of the city to Robinson Road south of downtown . NY 93 was realigned to follow the new bypass south to Robinson Road , where it turned east and followed Robinson Road ( CR 123 ) and Dysinger Road ( CR 133 ) to Akron Road in Royalton . The portion of Akron Road ( NY 93 's former routing ) east of the Lockport city limits became NY 954M , an unsigned reference route . + Ownership and maintenance of Robinson Road from the bypass to NY 78 was transferred from Niagara County to the state of New York on September 1 , 1990 , as part of a highway maintenance swap between the two levels of government . The portion of NY 93 between NY 78 and Akron Road became state @-@ maintained on October 1 , 1998 , as part of another swap that also transferred ownership and maintenance of Akron Road to Niagara County . Akron Road is now CR 142 . 
+ On November 1 , 2005 , the Niagara County Legislature voted on a measure to allow the county to ask the New York State Department of Transportation ( NYSDOT ) to remove the NY 93 designation from Upper Mountain Road , a county @-@ maintained highway , and reassign it to Junction Road ( NY 270 ) and Saunders Settlement Road ( NY 31 ) . The impetus for the change came from a resident of Upper Mountain Road , who demanded that trucks should be removed from the roadway . This part of the agenda was passed . NYSDOT obliged to the request in 2006 , rerouting NY 93 as proposed and truncating NY 270 southward to NY 31 . + + = = Major intersections = = + + + + = Operation USA = + + Operation USA ( OpUSA , Operation California , or OpCal ) is a non profit humanitarian organization dedicated to helping communities alleviate the effects of disaster , disease , and endemic poverty throughout the world by providing privately funded relief , reconstruction , humanitarian aid and development aid . It is exclusively privately funded , receiving no assistance from the United States Federal Government . OPUSA had a revenue of over $ 22 million in fiscal year 2012 and has shipped over $ 425 million worth of " high @-@ priority medical , nutritional and shelter supplies " since its inception , including shipments to Haiti , Japan , Chile , Kenya and Pakistan in 2011 and 2011 . + + = = Awards and affiliations = = + + Operation USA was part of the International Campaign to Ban Landmines in 1997 when it won the Nobel Peace Prize . Operation California was also the winner of the 1983 President 's Volunteer Action Award . Operation USA has been named one of America 's Best 100 Charities by Worth Magazine and , in October 2008 , was named the top @-@ rated " exclusively privately funded charity in the U.S. " by Charity Navigator . 
Operation USA collaborated with NASA 's Jet Propulsion Laboratory and the US National Laboratories at Lawrence Livermore and Los Alamos to develop new approaches to land mine detection , is a member of InterAction , and is an AlertNet news partner . In 2014 Operation USA 's CEO Richard M. Walden received the Honeywell Hometown Hero Award from the Honeywell Corp. + + = = History = = + + Operation California began in 1979 as " a relief organization created to provide aid to Vietnamese Boat People and Cambodian refugees " , founded by Richard Walden ( still active as President & CEO ) and Llewellyn Werner ( who left in early 1980 ) . The organization flew " the first international relief airlift to Cambodia since 1975 " , delivering medicine to Phnom @-@ Penh . Operation California had airlifted more than $ 3 million worth of aid by October 1979 . + Since then , Operation USA has become a highly acclaimed aid organization that is involved in helping people in different ways around the world . In 1982 , Operation California sent " the first private airlift from the U.S. to Poland " , delivering 200 @,@ 000 lbs. of medical supplies and medicine ; that year Operation California also airlifted medical supplies to Lebanon . In 1983 , Operation California delivered aid to the children of Vietnam and Cambodia . Operation California provided aid to the earthquake victims in Mexico City in 1985 , as well as working in cooperation with the Unitarian Universalist Service Committee and Oxfam America , to deliver $ 250 @,@ 000 worth of medical aid to Nicaragua . In 1986 Operation California , in conjunction with Medical Aid to El Salvador , sent " [ t ] wo cargo planes carrying $ 500 @,@ 000 worth of relief supplies to earthquake @-@ stricken El Salvador " . + In 1988 , Operation California began using the name Operation USA because it better described the effort and intent of the organization to represent the entire American people . 
In 1989 Operation USA facilitated operations on children in Vietnam who had cleft palates by a Los Angeles @-@ based plastic surgeon , Dr Stanley Frileck . Medical aid effort was delivered to Mexico in 1990 , by OPUSA in conjunction with USSR relief workers . In 1991 OPUSA delivered aid to Bangladesh . OpUSA delivered aid to war @-@ torn Somalis in 1993 . In 1994 OpUSA provided earthquake relief . In 1995 the organization provided aid to Hurricane Mitch survivors in Honduras and Nicaragua . In 1999 OpUSA supplied aid to storm victims in Mexico . In 2003 OpUSA delivered aid to Iraq War victims in the Persian Gulf . The tsunami victims in Sri Lanka and Indonesia were aided by OpUSA in 2004 , as well as the Mexico City Flood victims . + In 2008 , OpUSA delivered aid to Myanmar cyclone victims as well as Chinese earthquake victims and flood victims in the Midwest , USA . + In 2015 , OpUSA partnered with UniversalGiving to raise funds for its project to deliver recovery aid to Nepal earthquake victims . + + = = Celebrity affiliates = = + + Operation USA , since the early 1980s , has relied on fundraising efforts featuring singers and celebrities . These include concerts , dinners , and other events . These promotions have featured : + Barbra Streisand + Bonnie Raitt + Carol Burnett + Crosby , Stills & Nash + Don Henley + Ed Asner + Frank Sinatra + Jack Elliot + Jackson Browne + James Garner + John Denver + Julie Andrews + Kirk Douglas + Michael Jackson + New American Orchestra + Plácido Domingo + Ricardo Montalban + Ry Cooder + Sharon Stone + The Buena Vista Social Club + Tony Adams + Rosario Dawson travelled with Operation USA to Nicaragua in 2008 . George Hamilton assisted with relief to The Philippines Typhoon Haiyan in 2013 as did Barbra Streisand , Rosario Dawson , Jackson Browne , Bill Maher and Judd Apatow . 
+ + = = Film and theater projects = = + + Operation USA also relies on film and theater promotions to generate funds that pay for aid , including : + Because We Care ( CBS Television Special ) + Beyond Borders ( Hollywood ) + Buena Vista Social Club ( film ) ( Hollywood & Havana ) + Fidel ( film ) ( Hollywood for Showtime ) + Mary Poppins ( musical ) ( London stage ) + Miss Saigon ( Hollywood ) + Roll Bounce + The Killing Fields ( film ) ( Hollywood & Cambodia ) + Victor / Victoria ( Broadway ) + Eloise at The Plaza ( Disney TV ) + + + = Typhoon Krosa ( 2013 ) = + + Typhoon Krosa , known in the Philippines as Typhoon Vinta , was a typhoon that made landfall in the northern Philippines in late October 2013 . Forming on October 27 near Guam , the storm slowly intensified while moving westward . Krosa developed an eye and became a typhoon before striking Luzon on October 31 . The storm weakened over land , but re @-@ intensified over the South China Sea , reaching peak winds of 150 km / h ( 90 mph ) on November 2 off the southeast coast of China . Typhoon Krosa stalled and encountered unfavorable conditions , resulting in quick weakening . By November 3 , it had weakened to tropical storm status , and was no longer being warned on by the next day . In northern Luzon , Krosa damaged 32 @,@ 000 houses , including 3 @,@ 000 that were destroyed , and caused four fatalities . High winds and rainfall left P277 million ( PHP , $ 6 @.@ 4 million USD ) in damage . + + = = Meteorological history = = + + On October 27 , an area of convection with a broad circulation persisted southeast of Guam , and slowly consolidated due to moderate wind shear and westerly outflow . That day , the Japan Meteorological Agency ( JMA ) classified the system as a tropical depression about 380 km ( 235 mi ) to the southeast of Hagåtña , Guam . 
At 2100 UTC on October 28 , the Philippine Atmospheric , Geophysical and Astronomical Services Administration ( PAGASA ) began issuing advisories on the depression , giving it the local name Vinta . The next day , the JMA upgraded the depression to Tropical Storm Krosa ( 1329 ) , and the Joint Typhoon Warning Center ( JTWC ) also classified it as Tropical Depression 29W . By that time , the storm was moving steadily westward due to the subtropical ridge to the north . With the warm waters of the Philippine Sea , lessening wind shear , and improving outflow , Krosa gradually strengthened , and the JTWC also upgraded Krosa to tropical storm status on October 30 after an eye feature developed . + While approaching northern Luzon on October 29 , Krosa quickly intensified as the initial eye feature organized into a well @-@ defined eye . Late on October 30 , the JTWC upgraded Krosa to typhoon status , and the next day , both PAGASA and JMA followed suit . On October 31 , Krosa made landfall in northeastern Luzon near Cagayan , and developed a symmetric eyewall while initially moving over land . Land interaction weakened the eye by the time Krosa emerged into the South China Sea late on October 31 . The next day , PAGASA discontinued advisories after the typhoon exited the region . Convection rebuilt around the center , with continued favorable conditions allowing for restrengthening . + Late on November 1 , a large eye redeveloped , and the JTWC estimated Krosa attained peak 1 minute sustained winds of 185 km / h ( 115 mph ) . Early the next day , the JMA also estimated the typhoon reached peak 10 minute winds of 140 km / h ( 85 mph ) . Later , increasing wind shear caused the eye to deteriorate , and Krosa began slowing about 260 km ( 160 mi ) east @-@ southeast of Hong Kong after reaching the western edge of the subtropical ridge . After remaining nearly stationary , Krosa began moving steadily to the west @-@ southwest due to a new ridge . 
The convection continued to weaken due to continued shear and cooler waters from upwelling , and Krosa deteriorated to tropical storm status on November 3 . The next day , the JTWC issued its final advisory after the circulation became exposed from the convection . Also on November 4 , the JMA downgraded Krosa to tropical depression status off the northeast Vietnam coast . The system dissipated at 0000 UTC on November 5 . + + = = Preparations and impact = = + + Before Krosa struck the Philippines , PAGASA issued a number 3 warning signal for portions of northern Luzon , where winds were expected to reach over 100 km / h ( 60 mph ) . The agency noted for the potential for flooding and landslides . High winds knocked down trees across Luzon , and left about 80 % of Cagayan province without power , as well as some areas without internet or cellphone service . Portions of the Pan @-@ Philippine Highway were blocked , and in Lal @-@ Lo , Cagayan , a car crashed into a gasoline truck due to power outages . Agriculture damage was estimated at P273 million ( PHP , $ 6 @.@ 3 million USD ) , occurring just before the start of the harvest . Across the island , the typhoon damaged 32 @,@ 745 houses , including 3 @,@ 837 that were destroyed , forcing 65 @,@ 648 people to evacuate to storm shelters . Overall , Krosa killed four people in the Philippines , and left P273 million ( PHP , $ 6 @.@ 4 million USD ) in damage . After the storm , workers quickly restored power lines , while the government provided monetary assistance to storm @-@ ravaged families , after Cagayan was declared a state of calamity . Members of the Philippine military and Department of Public Works and Highways worked to clean up following the storm . + The China National Meteorological Centre issued a " yellow alert " , the second @-@ lowest of the four level warning system , for Hainan due to the threat of the storm . The agency recommended boats to return to port . 
Agencies in Vietnam also warned for the potential of heavy rainfall due to the dissipating Tropical Depression Krosa , and released water from three dams to prevent overflow . + + + = Kirby 's Block Ball = + + Kirby 's Block Ball is a 1995 action video game , a spin @-@ off from the Kirby series for the Game Boy portable console . It is a Breakout clone ; the player controls paddles along the screen 's edge to knock a bouncing ball , Kirby , into destructible bricks . The game 's 55 levels include power @-@ ups , bonus rounds , and minigames . Kirby 's Block Ball was developed by HAL Laboratory and Nintendo R & D1 . The team spent half a year revising the gameplay to match Kirby 's signature characteristics . Kirby 's Block Ball was published by Nintendo first in Japan in 1995 , later in Europe , and last in North America in 1996 . + Reviewers considered the game an improvement on the Breakout formula and praised its gameplay craftsmanship and incorporation of the Kirby series . It was included in multiple top Game Boy game lists and was later emulated on the Nintendo 3DS Virtual Console . + + = = Gameplay = = + + The player controls paddles along the screen 's edges to knock a bouncing ball , Kirby , into destructible bricks . The player loses a life if Kirby hits the edge of the screen . Each of the game 's eleven stages include five rounds of increasingly complex block patterns for Kirby to clear . The ten different block types vary in durability and points value . A well @-@ timed hit of the paddle gives Kirby a powerful bounce to break through harder blocks . Another block type turns the remaining blocks into a bonus round that rewards the player for clearing the screen in the least amount of time . The player can find warp stars that lead to minigames , such as air hockey , where the player can earn extra lives . The rounds also include enemies to attack and avoid . Some enemies contain bonus items . Each stage ends in a boss fight . 
+ With stone , needle , flame , and spark power @-@ ups , Kirby can transform to interact with blocks differently . For instance , the spark power @-@ up lets Kirby break through otherwise indestructible blocks , and the needle lets Kirby hit spikes once without losing a life . The game has a themed frame and uses a wide palette of colors in @-@ game when played with the Super Game Boy . + + = = Development = = + + The game was developed by HAL Laboratory with Gunpei Yokoi 's Nintendo R & D1 , and published by Nintendo . At one point in development , HAL decided that the game did not feel like a Kirby game . The team spent six months completely revising the game under explicit instructions on how Kirby should move . Kirby games contain elements of unrestricted , creative movement as a general theme . Kirby 's Block Ball was released for the Game Boy first in Japan in 1995 and later in Europe ( 1995 ) and North America ( May 1996 ) . It was later emulated on the Nintendo 3DS Virtual Console , and released first in Japan ( October 2011 ) and later in Europe ( February 2012 ) and North America ( May 2012 ) . + + = = Reception and legacy = = + + On release , the four reviewers of Electronic Gaming Monthly applauded Kirby 's Block Ball for modifying the Breakout formula to create a new and enjoyable game . They especially praised the unique power @-@ ups , though Crispin Boyer and Sushi X also felt the game was too short and easy . Nintendo Power said they enjoyed Block Ball and its number of stages , but wondered how its eight megabits of memory were being used . The magazine found the parts where Kirby eats the unbreakable blocks to be innovative . All six of the magazine 's reviewers recommended the game . + IGN wrote that the game was primarily remembered as " an Arkanoid or Breakout clone skinned with the Kirby franchise " . IGN calculated an average reviewer score of 7 @.@ 4 / 10 . 
The Kirby series became known for its number of non @-@ platformer spin @-@ offs , of which Block Ball was one , like Kirby 's Pinball Land and Kirby 's Dream Course . Kirby 's spherical shape lent itself towards ball @-@ like roles . IGN wrote that Block Ball was the first " truly out there " Kirby spin @-@ off , but that the game was too short . + Planet Game Boy called it one of the original Game Boy 's ten " all @-@ time classics " and GamesRadar placed it among the top 25 Game Boy games released . They considered Kirby 's Block Ball an improvement upon Alleyway , a Game Boy launch title and Breakout clone . IGN recommended the game upon its 3DS rerelease both in general and for Breakout fans . Nintendo World Report recommended the game to players who like score attack games and called it the best version of Breakout released . Retrospective reviewers found the game enjoyable and praised the craft behind the gameplay and Kirby themes . Alternatively , Kirby 's Block Ball received the lowest rating on Tim Rogers 's 2004 " Yamanote Scoring System for Portable Games " ( a metric by which he played a game while counting stops on the circular Yamanote train line until he lost interest ) with a score of " one " stop . He called it " too damned bland " . + In a retrospective review , Jeuxvideo.com had high praise for the level design , graphics , and animations . They also found the music excellent in comparison to the annoying and repetitive soundtrack of most Breakout clones . The magazine also liked how the game fit the Kirby universe , apart from its increased difficulty — Jeuxvideo.com occasionally had trouble hitting the slow @-@ paced ball with precision . + + + = Hannah Dodd = + + Hannah Dodd ( born 27 April 1992 ) is an Australian Grade IV equestrian and 2 @.@ 0 point wheelchair basketball player who represented Australia in equestrian at the 2012 Summer Paralympics in London , coming 11th and 12th in her events . 
Switching to wheelchair basketball , she made her debut with the national team at the Osaka Cup in February 2015 . + In 2008 , Dodd was the Australian national Grade IV para @-@ equestrian champion . She was runner @-@ up in 2009 , and won the Australian national championships again in 2011 , along with the Oceania Championships and the National Titles team events . By 2012 , she was the top @-@ ranked Australian competitor in her event and class . + After the London Paralympics , Dodd took up wheelchair basketball . She started playing for the Sydney University Flames in the Women 's National Wheelchair Basketball League in 2013 , made her debut with the national team at the Osaka Friendship Games in Osaka in February 2015 , winning bronze , and was part of the Under 25 team at the 2015 Women 's U25 Wheelchair Basketball World Championship in Beijing in July 2015 , winning silver . + + = = Personal = = + + Hannah Dodd was born on 27 April 1992 , and is from Arcadia , New South Wales . She has sacral agenesis and spina bifida with upper limb dystonia , and is missing four vertebrae in her back . When she was about a year old , her kidneys started failing . Her entire renal system needed to be reconstructed . She has two older brothers . She can walk with the aid of a caliper , and also uses a wheelchair . As of 2012 , she is a horse riding teacher and student at the University of Western Sydney where she is majoring in sports and exercise science . + + = = Equestrian = = + + Dodd is a Grade IV equestrian competitor , coached by Peter Turner . Due to her sacral agenesis , when she rides her horse , she dislocates several bones every time , but as a result of anti @-@ doping rules , she has had to find alternative ways of coping with pain associated with riding . + Dodd has been around horses since she was four months old , and was able to ride on her own by the time she was two years old , before she learned to walk . The sport gave her a degree of independence . 
She started competing in 2005 , and first represented Australia in 2006 , winning her first test in England that year . In 2008 , she became the youngest @-@ ever winner of the Australian national championships . She finished first at the March 2009 inter @-@ schools cup at the St Ives Showground , and second at the 2009 Australian national championships , but her horse , Lucifer 's Dream , was injured in 2009 . In 2009 and 2010 , she searched for another horse to assist her in getting through Paralympic qualification . She won the Australian national championships again in 2011 , along with the Oceania Championships and the National Titles team events . By 2012 , she was the top @-@ ranked Australian competitor in her event and class . + Dodd was selected to represent Australia at the 2012 Summer Paralympics in London in equestrian events with her horse Waikiwi . These Games were her first , and she was the youngest Australian equestrian competitor . A fund raiser was organised by Arcadia , New South Wales , residents . While her own costs and the cost of her horse were covered by Australian Paralympic Committee and Equestrian Australia , funds were required for her coach . She was placed 12th in the Individual Championship Test – Grade IV , and 11th in the Individual Freestyle Test – Grade IV and Team Test – Grade IV . + + = = Wheelchair basketball = = + + After the London Paralympics , Dodd took up wheelchair basketball . She started for the Sydney University Flames in the Women 's National Wheelchair Basketball League in 2013 . She has to strap her fingers and wrists , and usually dislocates a shoulder during a game . " I 've had a few bangs and scrapes and been tipped out of my chair a few times , " she concedes , " but it 's really fun . The fast pace really gives you an adrenalin kick and the girls I play with are awesome . " " If I have chose between my two sports for Rio , " she said , " I will go with basketball . 
" She made her debut with the national team , known as the Gliders , at the Osaka Cup in Osaka in February 2015 . The Gliders won bronze . In June 2015 , Dodd was selected as part of the under 25 team ( known as the Devils ) for the 2015 Women 's U25 Wheelchair Basketball World Championship in Beijing in July . The Devils won silver . By this time her health had deteriorated . She had to use a wheelchair much of the time , and her classification had dropped to a 2 @.@ 5 point player . In 2015 , she was reclassified a 2 @.@ 0 . + + + = Commonwealth War Graves Commission = + + The Commonwealth War Graves Commission ( CWGC ) is an intergovernmental organisation of six independent member states whose principal function is to mark , record and maintain the graves and places of commemoration of Commonwealth of Nations military service members who died in the two World Wars . The Commission is also responsible for commemorating Commonwealth civilians who died as a result of enemy action during World War II . The Commission was founded by Fabian Ware and constituted through Royal Charter in 1917 named the Imperial War Graves Commission . The change to the present name took place in 1960 . + The Commission , as part of its mandate , is responsible for commemorating all Commonwealth war dead individually and equally . To this end , the war dead are commemorated by name on a headstone , at an identified site of a burial , or on a memorial . War dead are commemorated uniformly and equally , irrespective of military or civil rank , race or creed . + The Commission is currently responsible for the continued commemoration of 1 @.@ 7 million deceased Commonwealth military service members in 153 countries . Since its inception , the Commission has constructed approximately 2 @,@ 500 war cemeteries and numerous memorials . 
The Commission is currently responsible for the care of war dead at over 23 @,@ 000 separate burial sites and the maintenance of more than 200 memorials worldwide . In addition to commemorating Commonwealth military service members , the Commission maintains , under arrangement with applicable governments , over 40 @,@ 000 non @-@ Commonwealth war graves and over 25 @,@ 000 non @-@ war military and civilian graves . The Commission operates through the continued financial support of the member states : United Kingdom , Canada , Australia , New Zealand , India and South Africa . The current President of the Commonwealth War Graves Commission is Prince Edward , Duke of Kent . + + = = History = = + + + = = = World War I = = = + + On the outbreak of World War I in 1914 , Fabian Ware , a director of the Rio Tinto Company , found that at 45 years old he was too old to join the British Army . He used the influence of Rio Tinto chairman , Viscount Milner , to become the commander of a mobile unit of the British Red Cross . He arrived in France in September 1914 and whilst there was struck by the lack of any official mechanism for documenting or marking the location of graves of those who had been killed and felt compelled to create an organisation within the Red Cross for this purpose . In March 1915 , with the support of Nevil Macready , Adjutant @-@ General of the British Expeditionary Force , Ware 's work was given official recognition and support by the Imperial War Office and the unit was transferred to the British Army as the Graves Registration Commission . The new Graves Registration Commission had over 31 @,@ 000 graves of British and Imperial soldiers registered by October 1915 and 50 @,@ 000 registered by May 1916 . + When municipal graveyards began to overfill Ware began negotiations with various local authorities to acquire land for further cemeteries . 
Ware began with an agreement with France to build joint British and French cemeteries under the understanding that these would be maintained by the French government . Ware eventually concluded that it was not prudent to leave the maintenance responsibilities solely to the French government and subsequently arranged for France to purchase the land , grant it in perpetuity , and leave the management and maintenance responsibilities to the British . The French government agreed under the condition that cemeteries respected certain dimensions , were accessible by public road , were in the vicinity of medical aid stations and were not too close to towns or villages . Similar negotiations were started with the Belgian government . + As reports of the grave registration work became public , the Commission began to receive letters of enquiry and requests for photographs of graves from relatives of deceased soldiers . By 1917 , 17 @,@ 000 photographs had been dispatched to relatives . In March 1915 , the Commission , with the support of the Red Cross , began to dispatch photographic prints and cemetery location information in answer to the requests . The Graves Registration Commission became the Directorate of Graves Registration and Enquiries in the spring of 1916 in recognition of the fact that the scope of work began to extend beyond simple grave registration and began to include responding to enquiries from relatives of those killed . The directorate 's work was also extended beyond the Western Front and into other theatres of war , with units deployed in Greece , Egypt and Mesopotamia . + + = = = Formal establishment = = = + + As the war continued , Ware and others became concerned about the fate of the graves in the post @-@ war period . Following a suggestion by the British Army , the National Committee for the Care of Soldiers ' Graves was appointed by the British government in January 1916 , with Edward , Prince of Wales agreeing to serve as president . 
The National Committee for the Care of Soldiers ' Graves was created with the intention of taking over the work of the Directorate of Graves Registration and Enquiries after the war . The government felt that it was more appropriate to entrust the work to a specially appointed body rather than to any existing government department . By early 1917 a number of members of the committee believed a formal imperial organisation would be needed to care for the graves . With the help of Edward , Prince of Wales , Ware submitted a memorandum to the Imperial War Conference in 1917 suggesting that an imperial organisation be constituted . The suggestion was accepted and on 21 May 1917 the Imperial War Graves Commission was established by Royal Charter , with the Prince of Wales serving as president , Secretary of State for War Lord Derby as chairman and Ware as vice @-@ chairman . The Commission 's undertakings began in earnest at the end of the First World War . Once land for cemeteries and memorials had been guaranteed , the enormous task of recording the details of the dead could begin . By 1918 , some 587 @,@ 000 graves had been identified and a further 559 @,@ 000 casualties were registered as having no known grave . + The scale , and associated high number of casualties , of the war produced an entirely new attitude towards the commemoration of war dead . Previous to World War I , individual commemoration of war dead was often on an ad hoc basis and was almost exclusively limited to commissioned officers . However , the war required mobilisation of a significant percentage of the population , either as volunteers or through conscription . An expectation had consequently arisen that individual soldiers would expect to be commemorated , even if they were low @-@ ranking members of the military . 
A committee under Frederic Kenyon , Director of the British Museum , presented a report to the Commission in November 1918 detailing how it envisioned the development of the cemeteries . Two key elements of this report were that bodies should not be repatriated and that uniform memorials should be used to avoid class distinctions . Beyond the logistical nightmare of returning home so many corpses , it was felt that repatriation would conflict with the feeling of brotherhood that had developed between serving ranks . + An article in The Times on 17 February 1919 by Rudyard Kipling carried the Commission 's proposal to a wider audience and described what the graves would look like . The article entitled War Graves : Work of Imperial Commission : Mr. Kipling 's Survey was quickly republished as an illustrated booklet , Graves of the Fallen . The illustrated booklet was intended to soften the impact of Kenyon 's report as it included illustrations of cemeteries with mature trees and shrubs ; contrasting the bleak landscapes depicted in published battlefield photos . There was an immediate public outcry following the publication of the reports , particularly with regards to the decision to not repatriate the bodies of the dead . The reports generated considerable discussion in the press which ultimately led to a heated debate in Parliament on 4 May 1920 . Sir James Remnant started the debate , followed by speeches by William Burdett @-@ Coutts in favour of the Commission 's principles and Robert Cecil speaking for those desiring repatriation and opposing uniformity of grave markers . Winston Churchill closed the debate and asked that the issue not proceed to a vote . Remnant withdrew his motion , allowing the Commission to carry out its work assured of support for its principles . 
+ + = = = First cemeteries and memorials to the missing = = = + + Three of the most eminent architects of their day , Sir Herbert Baker , Sir Reginald Blomfield , and Sir Edwin Lutyens were commissioned to design the cemeteries and memorials . Rudyard Kipling was appointed literary advisor for the language used for memorial inscriptions . + In 1920 , the Commission built three experimental cemeteries at Le Treport , Forceville and Louvencourt , following the principles outlined in the Kenyon report . Of these , the Forceville Communal Cemetery and Extension was agreed to be the most successful . Having consulted with garden designer Gertrude Jekyll , the architects created a walled cemetery with uniform headstones in a garden setting , augmented by Blomfield 's Cross of Sacrifice and Lutyens ' Stone of Remembrance . After some adjustments , Forceville became the template for the Commission 's building programme . Adjustments were required because all three experimental cemeteries went over budget . To ensure future cemeteries remained within their budget the Commission decided to not build shelters in cemeteries that contained less than 200 graves , to not place a Stone of Remembrance in any cemetery with less than 400 graves , and to limit the height of cemetery walls to 1 metre ( 3 @.@ 3 ft ) . + At the end of 1919 , the Commission had spent £ 7 @,@ 500 , and this figure rose to £ 250 @,@ 000 in 1920 as construction of cemeteries and memorials increased . By 1921 , the Commission had established 1 @,@ 000 cemeteries which were ready for headstone erections , and burials . Between 1920 and 1923 , the Commission was shipping 4 @,@ 000 headstones a week to France . In many cases small cemeteries were closed and the graves concentrated in larger ones . By 1927 , when the majority of construction had been completed , over 500 cemeteries had been built , with 400 @,@ 000 headstones , a thousand Crosses of Sacrifice , and 400 Stones of Remembrance . 
+ The Commission had also been mandated to individually commemorate each soldier who had no known grave , which amounted to 315 @,@ 000 in France and Belgium alone . The Commission initially decided to build 12 monuments on which to commemorate the missing ; each memorial being located at the site of an important battle along the Western Front . After resistance from the French committee responsible for the approvals of memorials on French territory , the Commission revised their plan and reduced the number of memorials , and in some cases built memorials to the missing in existing cemeteries rather than as separate structures . + Reginald Blomfield 's Menin Gate was the first memorial to the missing located in Europe to be completed , and was unveiled on 24 July 1927 . The Menin Gate ( Menenpoort ) was found to have insufficient space to contain all the names as originally planned and 34 @,@ 984 names of the missing were instead inscribed on Herbert Baker 's Tyne Cot Memorial to the Missing . Other memorials followed : the Helles Memorial in Gallipoli designed by John James Burnet ; the Thiepval Memorial on the Somme and the Arras Memorial designed by Edwin Lutyens ; and the Basra Memorial in Iraq designed by Edward Prioleau Warren . The Dominions and India also erected memorials on which they commemorated their missing : the Neuve @-@ Chapelle Memorial for the forces of India , the Vimy Memorial by Canada , the Villers @-@ Bretonneux Memorial by Australia , the Delville Wood Memorial by South Africa and the Beaumont @-@ Hamel Memorial by Newfoundland . The programme of commemorating the dead of the Great War was considered essentially complete with the inauguration of the Thiepval Memorial in 1932 , though the Vimy Memorial would not be finished until 1936 , the Villers @-@ Bretonneux Memorial until 1938 and stonemasons were still conducting work on the Menin Gate when Germany invaded Belgium in 1940 . 
+ The only memorial created by the Commission that was not in the form of a monument or cemetery was the Ophthalmic Institute at Giza , Egypt — complete with library , and bacteriology and pathology departments — as its memorial to men of the Egyptian Labour Corps and Camel Transport Corps . Its erection was agreed with local political pressure . + + = = = World War II = = = + + From the start of the Second World War in 1939 , the Commission organised grave registration units and , planning ahead based on the experience gained from the First World War , earmarked land for use as cemeteries . When the war began turning in favour of the Allies , the Commission was able to begin restoring its First World War cemeteries and memorials . It also began the task of commemorating the 600 @,@ 000 Commonwealth casualties from the Second World War . In 1949 , the Commission completed Dieppe Canadian War Cemetery , the first of 559 new cemeteries and 36 new memorials . Eventually , over 350 @,@ 000 new headstones were erected . Many were made from Hopton Wood stone . The wider scale of World War II , coupled with manpower shortages and unrest in some countries , meant that the construction and restoration programmes took much longer . Following the war , the Commission implemented a five @-@ year horticultural renovation programme . The horticultural neglect was largely addressed by 1950 but there were necessary structural repairs to be made . These , together with the backlog of maintenance tasks from before the war , took a further 10 years to complete and the programme was not completed until the 1960s . + With the increased number of civilian casualties compared with World War I , Winston Churchill agreed to Ware 's proposal that the Commission also maintain a record of Commonwealth civilian war deaths . 
A supplemental chapter was added to the Imperial War Graves Commission 's charter on 7 February 1941 , empowering the organisation to collect and record the names of civilians who died from enemy action during the Second World War , which resulted in the creation of the Civilian War Dead Roll of Honour . The roll eventually contained the names of nearly 67 @,@ 000 civilians . The Commission and the Dean of Westminster reached an agreement that the roll would eventually be placed in Westminster Abbey but not until the roll was complete and hostilities had ended . The Commission handed over the first six volumes to the Dean of Westminster on 21 February 1956 ; the final volume was added to the showcase in 1958 . + + = = = Post – World War II = = = + + Following World War II the Commission recognised that the word ' Imperial ' within its name was no longer appropriate . In the spirit of strengthening national and regional feelings the organisation 's name was changed to Commonwealth War Graves Commission in 1960 . + More recent conflicts have sometimes made it impossible for the Commission to care for cemeteries in a given region or resulted in the destruction of sites altogether . Zehrensdorf Indian Cemetery in Germany was unkempt after the end of World War II and until the German reunification because it was located in an area occupied by Russian forces and was not entirely rebuilt until 2005 . The Six @-@ Day War and War of Attrition resulted in the destruction of Port Tewfik Memorial and Aden Memorial , and the death of a Commission gardener at Suez War Memorial Cemetery . During the Lebanese Civil War two cemeteries in Beirut were destroyed and had to be rebuilt . The maintenance of war graves and memorials in Iraq has remained difficult since the Iran – Iraq War in the 1980s , with regular maintenance being impractical since the Gulf War . + The Commission has also provided , and continues to provide , support for war graves outside its traditional mandate . 
In 1982 , the British Ministry of Defence requested the Commission 's assistance to design and construct cemeteries in the Falkland Islands for those killed during the Falklands War . Although these cemeteries are not Commonwealth War Graves Commission cemeteries , the Commission manages the administrative responsibilities of these cemeteries . Since 2005 , the Commission has carried out similar management duties on behalf of the British Ministry of Defence for cemeteries and graves of British and Imperial soldiers who died during the Second Boer War . In 2003 , Veterans Affairs Canada employed the Commission to develop an approach to locate grave markers for which the Canadian Minister of Veterans Affairs has responsibility . As of 2011 , the Commission conducts a twelve @-@ year cyclical inspection programme of Canadian veterans ' markers installed at the expense of the Government of Canada . + In 2008 , an exploratory excavation discovered mass graves on the edge of Pheasant Wood outside of Fromelles . Two @-@ hundred and fifty British and Australian bodies were excavated from five mass graves which were interred in the newly constructed Fromelles ( Pheasant Wood ) Military Cemetery . This was the first new Commonwealth War Graves Commission cemetery in more than 50 years , the last such cemeteries having been built after the Second World War . + + = = Burial sites and memorials = = + + The Commission is currently responsible for the continued commemoration of 1 @.@ 7 million deceased Commonwealth military service members in 153 countries and approximately 67 @,@ 000 civilians who died as a result of enemy action during World War II . Commonwealth military service members are commemorated by name on either a headstone , at an identified site of a burial , or on a memorial . As a result , the Commission is currently responsible for the care of war dead at over 23 @,@ 000 separate burial sites and maintenance of more than 200 memorials worldwide . 
The vast majority of burial sites are pre @-@ existing communal or municipal cemeteries and parish churchyards located in the United Kingdom , however the Commission has itself constructed approximately 2 @,@ 500 war cemeteries worldwide . The Commission has also constructed or commissioned memorials to commemorate the dead who have no known grave ; the largest of these is the Thiepval Memorial . + + = = = Qualifications for inclusion = = = + + The Commission only commemorates those who have died during the designated war years , while in Commonwealth military service or of causes attributable to service . The applicable periods of consideration are 4 August 1914 to 31 August 1921 for the First World War and 3 September 1939 to 31 December 1947 for the Second World War . The end date for the First World War period is the official end of the war , while for the Second World War the Commission selected a date approximately the same period after VE Day as the official end of the First World War was after the 1918 Armistice . + Civilians who died as a result of enemy action during the Second World War are commemorated differently from those that died as a result of military service . They are commemorated by name through the Civilian War Dead Roll of Honour located in St George 's Chapel in Westminster Abbey . In addition to its mandated duties , the Commission maintains , under arrangement with applicable governments , over 40 @,@ 000 non @-@ Commonwealth war graves and over 25 @,@ 000 non @-@ war military and civilian graves . + + = = = Architects and sculptors = = = + + As well as the main Principal Architects for France and Belgium ( Baker , Blomfield and Lutyens ) , there were Principal Architects appointed for other regions as well . Sir Robert Lorimer was Principal Architect for Italy , Macedonia and Egypt , while Sir John James Burnet was Principal Architect for Palestine and Gallipoli , assisted by Thomas Smith Tait . 
The Principal Architect for Mesopotamia was Edward Prioleau Warren . + As well as these senior architects , there was a team of Assistant Architects who were actually responsible for many of the cemetery and memorial designs . These architects were younger , and many of them had served in the war . The Assistant Architects were : George Esselmont Gordon Leith , Wilfred Clement von Berg , Charles Henry Holden ( who in 1920 became a Principal Architect ) , William Harrison Cowlishaw , William Bryce Binnie , George Hartley Goldsmith , Frank Higginson , Arthur James Scott Hutton , Noel Ackroyd Rew , and John Reginald Truelove . Other architects that worked for the Commission , or won competitions for the Commission memorials , included George Salway Nicol , Harold Chalton Bradshaw , Verner Owen Rees , Gordon H. Holt , and Henry Philip Cart de Lafontaine . + In January 1944 , Edward Maufe was appointed Principal Architect for the UK . Maufe worked extensively for the Commission for 25 years until 1969 , becoming Chief Architect and also succeeding Kenyon as Artistic Advisor . Together with Maufe , the other Principal Architects appointed during and after the Second World War were Hubert Worthington , Louis de Soissons , Philip Hepworth and Colin St Clair Oakes . + Leading sculptors that worked on the memorials and cemeteries after the First World War included Eric Henri Kennington , Charles Thomas Wheeler , Gilbert Ledward , and Charles Sargeant Jagger . Other sculptors , both in the inter @-@ war period and after the Second World War , included William Reid Dick , Ernest Gillick , Basil Gotto , Alfred Turner , Laurence A. Turner , Walter Gilbert , Henry Poole , Vernon Hill , Robert Anning Bell , Ferdinand Victor Blundstone , Joseph Armitage , and Gilbert Bayes . + + = = = Cemetery design = = = + + + = = = = Common architectural design features = = = = + + Structural design has always played an important part in the Commission 's cemeteries . 
Apart from a few exceptions , due to local geological conditions , the cemeteries follow the same design and uniform aesthetic all over the world . This makes the cemeteries easily recognisable and distinguishes them from war graves administered by other groups or countries . + A typical cemetery is surrounded by a low wall or hedge and with a wrought @-@ iron gate entrance . For cemeteries in France and Belgium , a land tablet near the entrance or along a wall identifies the cemetery grounds as having been provided by the French or Belgian governments . All but the smallest cemeteries contain a register with an inventory of the burials , a plan of the plots and rows , and a basic history of the cemetery . The register is located within a metal cupboard that is marked with a cross located in either the wall near the cemetery entrance or in a shelter within the cemetery . More recently , in larger sites , a stainless steel notice gives details of the respective military campaign . The headstones within the cemetery are of a uniform size and design and mark plots of equal size . + The cemetery grounds are , except in drier climates , grass covered with a floral border around the headstones . There is also an absence of any paving between the headstone rows which is intended to make the cemetery feel like a traditional walled garden where visitors could experience a sense of peace . However , Carter and Jackson argue that the uniform aesthetics are designed to evoke a positive experience which deliberately masks and sanitises the nature of the war deaths . + + = = = = Cross of Sacrifice and Stone of Remembrance = = = = + + Typically , cemeteries of more than 40 graves contain a Cross of Sacrifice designed by architect Reginald Blomfield . This cross was designed to imitate medieval crosses found in churchyards in England with proportions more commonly seen in the Celtic cross . 
The cross is normally a freestanding four @-@ point limestone Latin cross , mounted on an octagonal base , and ranging in height from 14 to 32 feet . A bronze longsword , blade down , is embedded on the face of the cross . This cross represents the faith of the majority of the dead and the sword represents the military character of the cemetery , intended to link British soldiers and the Christian concept of self @-@ sacrifice . + Cemeteries with more than 1000 burials typically have a Stone of Remembrance , designed by Edwin Lutyens with the inscription " Their Name Liveth for Evermore " . The concept of the Stone of Remembrance was developed by Rudyard Kipling to commemorate those of all faiths and none . In contrast to the Cross of Sacrifice , the design for the stone deliberately avoided " shapes associated with particular religions " . The geometry of the structure was based on studies of the Parthenon . Each stone is 3 @.@ 5 metres ( 11 ft ) long and 1 @.@ 5 metres ( 4 @.@ 9 ft ) high . The shape of the stone has been compared both to that of a sarcophagus and an altar . The feature was designed using the principle of entasis . The subtle curves in the design , if extended , would form a sphere 1 @,@ 801 feet 8 inches ( 549 @.@ 15 m ) in diameter . + + = = = = Headstones = = = = + + Every grave is marked with a headstone . Each headstone contains the national emblem or regimental badge , rank , name , unit , date of death and age of each casualty inscribed above an appropriate religious symbol and a more personal dedication chosen by relatives . The headstones use a standard upper case lettering designed by MacDonald Gill . Individual graves are arranged , where possible , in straight rows and marked by uniform headstones , the vast majority of which are made of Portland stone . The original headstone dimensions were 76 centimetres ( 30 in ) tall , 38 cm ( 15 in ) wide , and 7 @.@ 6 cm ( 3 @.@ 0 in ) thick . 
+ Most headstones are inscribed with a cross , except for those deceased known to be atheist or non @-@ Christian . In the case of burials of Victoria Cross or George Cross recipients , the regimental badge is supplemented by the Victoria Cross or George Cross emblem . Sometimes a soldier employed a pseudonym because they were too young to serve or were sought by law enforcement ; in such cases their primary name is shown along with the notation " served as " . Many headstones are for unidentified casualties ; they consequently bear only what could be discovered from the body . The epitaph , developed by Rudyard Kipling , that appears on the graves of unidentified soldiers for which no details are known is " A Soldier of the Great War known unto God " . Some headstones bear the text " believed to be buried in this cemetery " when the casualties are believed to be buried in the cemetery but the exact location of the grave is not known . In some cases soldiers were buried in collective graves and distinguishing one body from another was not possible and thus one headstone covers more than one grave . The headstone does not denote any specific details of the death except for its date , and even then only if it is known , and is deliberately ambiguous about the cause of death . + Due to local conditions it was sometimes necessary for the Commission to deviate from its standard design . In places prone to extreme weather or earthquakes , such as Thailand and Turkey , stone @-@ faced pedestal markers are used instead of the normal headstones . These measures are intended to prevent masonry being damaged during earthquakes or sinking into sodden ground . In Italy headstones were carved from Chiampo Perla limestone because it was in more plentiful supply . In Struma Military Cemetery , in Greece , to avoid risk of earthquake damage , small headstones are laid flat on the ground . The smaller size of the markers means that they often lack unit insignia . 
+ + = = = = Horticulture = = = = + + Commission cemeteries are distinctive in treating floriculture as an integral part of the cemetery design . Originally , the horticultural concept was to create an environment where visitors could experience a sense of peace in a setting , in contrast to traditionally bleak graveyards . Recommendations given by Arthur William Hill , the Assistant Director of the Royal Botanical Gardens at Kew enabled the Commission to develop cemetery layouts and architectural structures that took into account the placement of suitable plant life . Combining structural and horticultural elements was not unfamiliar to the Commission 's architects . Sir Edwin Lutyens furthered his long @-@ standing working relationship with horticulturist Gertrude Jekyll , whose devotion to traditional cottage garden plants and roses greatly influenced the appearance of the cemeteries . Where possible , indigenous plants were utilised to enhance sentimental associations with the gardens of home . + Variety in texture , height and timing of floral display were equally important horticultural considerations . The beds around each headstone are planted with a mixture of floribunda roses and herbaceous perennials . Low @-@ growing plants are chosen for areas immediately in front of headstones , ensuring that inscriptions are not obscured and preventing soil from splashing back during rain . In cemeteries where there are pedestal grave markers , dwarf varieties of plants are used instead . + The absence of any form of paving between the headstone rows contributes to the simplicity of the cemetery designs . Lawn paths add to the garden ambiance , and are irrigated during the dry season in countries where there is insufficient rain . Where irrigation is inappropriate or impractical , dry landscaping is an ecological alternative favoured by the Commission 's horticulturists , as is the case in Iraq . 
Drier areas require a different approach not only for lawns , but also to plants and styles of planting . Similarly , there are separate horticultural considerations in tropical climates . When many cemeteries are concentrated within a limited area , like along the Western Front or Gallipoli peninsula , mobile teams of gardeners operate from a local base . Elsewhere , larger cemeteries have their own dedicated staff while small cemeteries are usually tended by a single gardener working part @-@ time . + + = = Organisation = = + + + = = = Commissioners = = = + + The affairs of the CWGC are overseen by a Board of Commissioners . The president of the board is Prince Edward , Duke of Kent , the chairman is United Kingdom Secretary of State for Defence Michael Fallon and the vice @-@ chairman Vice @-@ Admiral Tim Laurence . The members are : the High Commissioner for New Zealand to the United Kingdom Lockwood Smith , the High Commissioners of Australia to the United Kingdom Alexander Downer , the Acting High Commissioner of the Republic of South Africa to the United Kingdom Obed Mlaba , the High Commissioner for India to the United Kingdom Ranjan Mathai , the High Commissioner for Canada to the United Kingdom Gordon Campbell , Hew Strachan , Keith Simpson , Kevan Jones , Edward Chaplin , Robert Fox , Ros Kelly and Lieutenant General Bill Rollo . Victoria Wallace is the Director @-@ General of the CWGC and serves as secretary . The board also has an Honorary Artistic Adviser , Peter Inskip . + + = = = Functional structure = = = + + The CWGC is headquartered in Maidenhead , England . Offices or agencies that are each responsible for a specific geographical area manage the worldwide affairs of the organisation . They are : + France Area is headed by a director and is responsible for France ( including the island of Corsica ) , Monaco and Switzerland . 
+ Northern Europe Area , headed by a director and responsible for Austria , Belgium , Czech Republic , Denmark , Estonia , Germany , Hungary , Latvia , Lithuania , Luxembourg , Netherlands , Norway , Poland and Sweden . + United Kingdom Area , headed by a director and responsible for Channel Islands , Faroe Islands , Iceland , Ireland , Isle of Man and the United Kingdom + Mediterranean Area headed by a director and responsible for Albania , Algeria , Azerbaijan , Azores , Bahrain , Canary Islands , Croatia , Cyprus , Egypt , Gibraltar , Greece , Israel and Palestine , Italy , Jordan , Lebanon , Libya , Macedonia , Madeira , Malta , Mauritania , Morocco , Oman , Portugal , San Marino , Saudi Arabia , Serbia , Spain , Syria , Tunisia , Turkey , United Arab Emirates and Yemen + Canadian Agency is headed by a secretary @-@ general and responsible for Canada , the entire Americas ( including the Caribbean ) + Australia , managed by the Office of Australian War Graves in the Australian Department of Veterans Affairs on behalf of the CWGC , is responsible for Australia , Norfolk Island , Papua New Guinea and the Solomon Islands + New Zealand , managed by the New Zealand Ministry of Culture and Heritage on behalf of the CWGC , is responsible for New Zealand , New Caledonia , Samoa , Society Islands , Tonga and Vanuatu + South Africa Agency is headed by a secretary and is responsible for Republic of South Africa , Namibia , Saint Helena and Ascension Island + Africa , Asia and Pacific Area is headed by a director and is responsible for areas not covered by any of the other bodies . + + = = = Financing = = = + + The CWGC 's work is funded predominantly by grants from the governments of the six member states . In the fiscal year 2012 / 13 , these grants amounted to £ 58 @.@ 6 million of the organisation 's £ 66 @.@ 5 million of income . This equates to an approximate cost of C $ 85 per commemorated war dead . 
The contribution from each country is proportionate to the number of graves the CWGC maintains on behalf of that country . The percentage of total annual contributions for which each country is responsible is United Kingdom 78 @.@ 4 % , Canada 10 @.@ 1 % , Australia 6 @.@ 1 % , New Zealand 2 @.@ 1 % , South Africa 2 @.@ 1 % and India 1 @.@ 2 % . + + = = Ongoing projects and issues = = + + + = = = War Graves Photographic Project = = = + + A project is underway to photograph the graves of and memorials to all service personnel from 1914 to the present day and make the images available to the public . The work is being carried out by The War Graves Photographic Project in conjunction with the CWGC . As of August 2013 , the project has recorded 1 @.@ 7 million photographs for posterity . + + = = = Reburials and identifications = = = + + Immediately following the First World War , the British Army remained responsible for the exhumation of remains . The Western Front was divided into sectors and combed for bodies by 12 @-@ man exhumation units . Between the Armistice and September 1921 , the exhumation units reburied 204 @,@ 695 bodies . After 1921 , no further widespread search for bodies was undertaken and in February 1921 responsibility of the cemeteries was transferred to the Commission . Despite the rigorous searches , bodies continued to be discovered in numbers . In the three years following the conclusion of the general search 38 @,@ 000 bodies were discovered . In the mid 1920s , 20 to 30 bodies were being discovered weekly . + The discovery of remains of First and Second World War casualties remains a common occurrence with approximately 30 bodies discovered annually . For example , in 2006 eight bodies of Canadian soldiers from the 78th Battalion ( Winnipeg Grenadiers ) , CEF were discovered in a backyard in Hallu , France . 
In April 2013 , the remains of four British soldiers discovered by a French farmer clearing land with a metal detector in 2009 were re @-@ interred at H.A.C. Cemetery near Arras , France . In March 2014 , the remains of 20 Commonwealth and 30 German soldiers were discovered in Vendin @-@ le @-@ Vieil , France with the Commonwealth soldiers being subsequently reburied at Loos British Cemetery . + When the remains of a Commonwealth soldier from the First or Second World War are discovered the Commission is notified and a Commission burial officer tries to collect any associated artifacts that may help identify the remains . The details are then registered and archived at the Commission 's headquarters . The collection of evidence can include artifacts with the remains , anthropological data and DNA . The archival records of the Commission are open to the public to permit individuals to conduct their own research . Investigation of archival records by members of the public periodically results in the identification of previously buried casualties . In December 2013 , it was discovered that Second Lieutenant Philip Frederick Cormack , who was previously commemorated on the Arras Flying Services Memorial , had in fact been buried in a French military cemetery in Machelen , East @-@ Flanders in Belgium . Sergeant Leonard Maidment was identified in 2013 after a visitor to Marfaux British Cemetery discovered a headstone of an unknown sergeant with the Hampshire Regiment killed on 20 July 1918 and was subsequently able to show that only one sergeant from that regiment had been killed in France on that date . + + = = = Vandalism = = = + + Cemeteries , including those of war dead , are targets for vandalism . The gravestones , cemeteries and buildings of the Commission are no exception . 
The Commission believes that graffiti and damage to stonework are usually the pursuits partaken by young people , noting the number of incidents increases when schoolchildren are on school holidays . Determined thieves will also steal the bronze swords off the Cross of Sacrifice , which are now replaced with identical ones made in fibreglass . + The vandalism of Commission cemeteries has also been connected to the participation of Commonwealth countries in contemporary conflicts . In the 1970s , in The Troubles , Commission cemeteries in Ireland experienced vandalism . Vandals defaced the central memorial of the Étaples Military Cemetery in northern France with anti @-@ British and anti @-@ American graffiti on 20 March 2003 immediately after the beginning of the Iraq War . On 9 May 2004 , thirty @-@ three headstones were demolished in the Gaza cemetery , which contains 3 @,@ 691 graves , allegedly in retaliation for the Abu Ghraib prisoner abuse scandal . On 24 February 2012 , during the Libyan Civil War , an Islamist militia damaged over 200 headstones in the Benghazi war cemetery as well as the central memorial . + + + = Tatwine = + + Tatwine or Tatwin ( Tatuini or Tadwinus ; c . 670 – 734 ) was the tenth Archbishop of Canterbury from 731 to 734 . Prior to becoming archbishop , he was a monk and abbot of a Benedictine monastery . Besides his ecclesiastical career , Tatwine was a writer , and riddles he composed survive . Another work he composed was on the grammar of the Latin language , which was aimed at advanced students of that language . He was subsequently considered a saint . + + = = Biography = = + + Tatwine was a Mercian by birth . His epigraph at Canterbury stated that when he died he was in old age , so perhaps he was born around 670 . He became a monk at the monastery at Breedon @-@ on @-@ the @-@ Hill in the present @-@ day County of Leicestershire , and then abbot of that house . 
Through the influence of King Æthelbald he was appointed as Archbishop of Canterbury in 731 and was consecrated on 10 June 731 . He was one of a number of Mercians who were appointed to Canterbury during the 730s and 740s . Apart from his consecration of the Bishops of Lindsey and Selsey in 733 , Tatwine 's period as archbishop appears to have been uneventful . He died in office on 30 July 734 . Later considered a saint , his feast day is 30 July . + + = = Writings = = + + Bede 's commentary on Tatwine calls him a " vir religione et Prudentia insignis , sacris quoque literis nobiliter instructus " ( a man notable for his prudence , devotion and learning ) . These qualities were displayed in the two surviving manuscripts of his riddles and four of his Ars Tatuini . The Ars is one of only two surviving 8th @-@ century Latin grammars from England , and was based on the works of Priscian and Consentius . The riddles deal with such diverse topics as philosophy and charity , the five senses and the alphabet , and a book and a pen . The riddles are formed in acrostics . The grammar is a reworking of Donatus 's Ars Minor with the addition of information drawn from other grammarians . It was not designed for a newcomer to the Latin language , but is designed for more advanced students . It covers the eight parts of speech through illustrations drawn from classical scholars , although not directly but through other grammatical works . There are also some examples drawn from the Psalms . The work was completed before he became archbishop , and was used not only in England but also on the continent . A recent edition of his works is Tatuini Opera omnia , published in 1968 with some translations into English and German from the original Latin . + + + = German Type UB I submarine = + + The Type UB I was a class of small coastal submarines ( U @-@ boats ) built in Germany at the beginning of the First World War . 
20 boats were constructed , most of which went into service with the German Imperial Navy . Boats of this design were also operated by the Austro @-@ Hungarian Navy ( Kaiserliche und Königliche Kriegsmarine or K.u.K. Kriegsmarine ) and the Bulgarian Navy . The group is sometimes known as the UB @-@ 1 class after SM UB @-@ 1 , the class leader . In the Austro @-@ Hungarian Navy , it was called the U @-@ 10 class . + Built to meet the need for small maneuverable submarines able to operate in the narrow , shallow seas off Flanders , the vessels were intended to be quickly constructed , then shipped by rail and assembled at their port of operation . The design effort began in mid @-@ August 1914 and by mid @-@ October the first 15 boats were ordered from two German shipyards . The German Imperial Navy subsequently ordered an additional pair of boats to replace two sold to Austria @-@ Hungary , who ordered a further three boats in April 1915 . A total of 20 UB Is were built . Construction of the first boats for Germany began in early November 1914 ; all 20 were completed by October 1915 . Several of the first boats underwent trials in German home waters , but the rest were assembled and tested at either Antwerp or Pola . The German boats operated primarily in the Flanders , Baltic , and Constantinople Flotillas . The boats were about 28 metres ( 92 ft ) long and displaced 127 tonnes ( 125 long tons ) when surfaced and 142 tonnes ( 140 long tons ) while submerged . All had two bow torpedo tubes and two torpedoes , and were equipped with a deck @-@ mounted machine gun . + In 1918 four of the surviving German boats were converted into coastal minelayers . Of the seventeen boats in German service , two were sold to Austria @-@ Hungary , one was sold to Bulgaria , and nine were lost during the war . One of the five Austro @-@ Hungarian boats was sunk and another mined and not repaired . 
The five surviving German boats , the four surviving Austro @-@ Hungarian boats , and the Bulgarian boat were all turned over to the Allies after the end of the war and were broken up . + + = = Design = = + + In the earliest stages of the First World War the German Army 's rapid advance along the North Sea coast found the German Imperial Navy without submarines suitable to operate in the narrow and shallow seas off Flanders . By 18 August 1914 , two weeks after the German invasion of Belgium , the planning of a series of small coastal submarines had already begun . + The German Imperial Navy stipulated that the submarines must be transportable by rail , which imposed a maximum diameter of 3 @.@ 15 metres ( 10 ft 4 in ) . The rushed planning effort — which had been assigned the name " Project 34 " — resulted in the Type UB I design , created specifically for operation from Flanders . The boats were to be about 28 metres ( 92 ft ) long and to displace about 125 tonnes ( 123 long tons ) with two bow torpedo tubes . + Boats of the Type UB I design were built by two manufacturers , Germaniawerft of Kiel and AG Weser of Bremen , which led to some variations in boats from the two shipyards . The eight Germaniawerft @-@ built boats were slightly longer at 28 @.@ 10 metres ( 92 ft 2 in ) length overall , while the twelve Weser @-@ built boats came in 22 centimetres ( 8 @.@ 7 in ) shorter than their counterparts . All were 3 @.@ 15 metres ( 10 ft 4 in ) abeam and had a draft of 3 @.@ 03 metres ( 9 ft 11 in ) . The boats all displaced 127 tonnes ( 125 long tons ) while surfaced , but differed slightly in displacement submerged . The slightly longer Germaniawerft boats displaced 142 tonnes ( 140 long tons ) while submerged , as they weighed 1 tonne ( 0 @.@ 98 long tons ) more than the Weser boats . 
+ The drivetrain of the boats consisted of a single propeller shaft driven by a Daimler ( Germaniawerft ) or Körting ( Weser ) diesel engine on the surface , or a Siemens @-@ Schuckert electric motor for underwater travel . The Weser boats were capable of nearly 7 @.@ 5 knots ( 13 @.@ 9 km / h ; 8 @.@ 6 mph ) on the surface and a little more than 6 knots ( 11 km / h ; 6 @.@ 9 mph ) submerged . The Germaniawerft boats were about 1 knot ( 1 @.@ 9 km / h ; 1 @.@ 2 mph ) slower than their Bremen @-@ made counterparts . The boats were equipped with two 45 @-@ centimetre ( 17 @.@ 7 in ) bow torpedo tubes and carried two torpedoes . They were also armed with a single 8 @-@ millimetre ( 0 @.@ 31 in ) machine gun affixed to the deck . + + = = Construction = = + + The German Imperial Navy ordered its first fifteen Type UB I boats on 15 October 1914 . Eight boats — numbered UB @-@ 1 to UB @-@ 8 — were ordered from Germaniawerft of Kiel , and seven boats — numbered UB @-@ 9 to UB @-@ 15 — from AG Weser of Bremen . After two of the class , UB @-@ 1 and UB @-@ 15 , were sold in February 1915 to ally Austria @-@ Hungary ( becoming U @-@ 10 and U @-@ 11 in the Austro @-@ Hungarian Navy ) , the German Imperial Navy ordered UB @-@ 16 and UB @-@ 17 from Weser . A further three for Austria @-@ Hungary — U @-@ 15 , U @-@ 16 , and U @-@ 17 — had been ordered from Weser by April , bringing the total number constructed to 20 . + UB @-@ 1 and UB @-@ 2 were laid down on 1 November 1914 at the Germaniawerft yard at Kiel . UB @-@ 1 was launched on 22 January 1915 , just 75 working days later . UB @-@ 2 's launch followed on 13 February . Among the Weser boats , UB @-@ 9 was laid down first , on 6 November 1914 , and launched on 6 February 1915 , a week ahead of UB @-@ 2 . These first three boats launched underwent trials in home waters , but most of the other members of the class were shipped via rail and underwent trials at their assembly point . 
+ The process of shipping the submarines by rail involved breaking the submarines down into what was essentially a knock down kit . Each boat was broken into approximately fifteen pieces and loaded on to eight railway flatcars . Type UB I boats destined for service with the Flanders Flotilla made a five @-@ day journey to Antwerp for the two- to three @-@ week assembly process . After assembly at Antwerp the boats were towed by barge to Bruges for trials . Boats selected for service in the Mediterranean were sent to the Austro @-@ Hungarian port of Pola for assembly . The total time from departure of the railcars from the shipyard to operational readiness for the boats was about six weeks . + By July 1915 all seventeen of the German Imperial Navy Type UB Is had been completed . + + = = Service = = + + During their trials the Type UB Is were found to be too small and too slow and had a reputation for being underpowered ; one commander compared his Type UB I to a " sewing machine " . According to authors R. H. Gibson and Maurice Prendergast in their 1931 book The German Submarine War , 1914 – 1918 , the UBs did not have enough power to chase down steamers while surfaced and lacked the endurance to spend any extended amount of time underwater , exhausting their batteries after little over an hour 's running . In @-@ service use revealed another problem : with a single propeller shaft / engine combination , if either component failed , the U @-@ boat was almost totally disabled . + Another reported problem with the Type UB Is was the tendency to break trim after the firing of torpedoes . The boats were equipped with compensating tanks designed to flood and offset the loss of the C / 06 torpedo 's 1 @,@ 700 @-@ pound ( 770 kg ) weight , but this system did not always function properly ; as a result , when firing from periscope depth the boat could broach after firing or , if too much weight was taken on , plunge to the depths . 
When UB @-@ 15 torpedoed and sank Italian submarine Medusa in June 1915 , the tank failed to properly compensate , forcing the entire crew to run to the stern to offset the trim imbalance . + Despite the problems , the " tin tadpoles " , as the Germans referred to them , were in active service from March 1915 through the end of the war , with half of the 20 boats lost during the war . Boats of the class served in three navies : the German Imperial Navy , the Austro @-@ Hungarian Navy , and the Bulgarian Navy . In German service , they served primarily in the Flanders Flotilla , the Baltic Flotilla , and the Constantinople Flotilla . + + = = = German Imperial Navy = = = + + + = = = = Flanders Flotilla = = = = + + The first Type UB I to enter service was UB @-@ 10 , which formed the nucleus of the Flanders Flotilla , on 27 March 1915 . By the end of April five more Type UB I boats had become operational . UB @-@ 10 was eventually joined in the Flanders Flotilla by UB @-@ 2 , UB @-@ 4 , UB @-@ 5 , UB @-@ 6 , UB @-@ 12 , UB @-@ 13 , UB @-@ 16 , and UB @-@ 17 ; of these , only UB @-@ 2 made the journey to Flanders by sea rather than rail . + UB @-@ 4 departed on the first patrol from Flanders on 9 April , and was responsible for sinking the first ship sent down by the flotilla . The Type UB I boats of the Flanders Flotilla originally patrolled the area between the United Kingdom and the Netherlands , but began patrolling the English Channel after UB @-@ 6 pioneered a route past British antisubmarine nets and mines in the Straits of Dover in late June . + Over the Type UB Is ' first year of service , UB @-@ 4 and UB @-@ 13 were both lost , and UB @-@ 2 and UB @-@ 5 were transferred to the Baltic Flotilla . In March 1917 , UB @-@ 6 ran aground in Dutch waters and was interned for the rest of the war , along with her crew . 
The four remaining Type UB Is in Flanders — UB @-@ 10 , UB @-@ 12 , UB @-@ 16 , UB @-@ 17 — were all converted to minelayers by 1918 , having their torpedo tubes removed and replaced with chutes to carry up to eight mines . All but UB @-@ 10 were lost in 1918 ; UB @-@ 10 , in poor repair and out of service , was scuttled in October 1918 when the Germans evacuated from Flanders . + + = = = = Baltic Flotilla = = = = + + UB @-@ 9 was initially assigned to the Baltic Flotilla , and was joined by UB @-@ 2 and UB @-@ 5 in early 1916 . All three became training boats at Kiel in 1916 , joining UB @-@ 11 in that duty . Little information is available about the Type UB I boats operating in the Baltic . + + = = = = Constantinople Flotilla = = = = + + Four of the German Imperial Navy boats — UB @-@ 3 , UB @-@ 7 , UB @-@ 8 , and UB @-@ 14 — were selected for service with the Constantinople Flotilla . All were sent to Pola for assembly and trials there as part of the Pola Flotilla before sailing on to join the Constantinople Flotilla . UB @-@ 3 disappeared en route to Constantinople in May 1915 , but the other three arrived there by mid @-@ June . + The three Type UB I boats of the Constantinople Flotilla seem to have patrolled primarily in the Black Sea . UB @-@ 8 was transferred to the Bulgarian Navy in May 1916 , and UB @-@ 7 disappeared in the Black Sea in October 1916 , leaving UB @-@ 14 as the sole remaining German Type UB I in the flotilla ; she was surrendered at Sevastopol in November 1918 to French armies stationed there during the Russian Civil War . + + = = = Austro @-@ Hungarian Navy = = = + + UB @-@ 1 and the still incomplete UB @-@ 15 were sold to Austria @-@ Hungary in February 1915 ; both were dismantled and shipped to Pola in May . After one cruise under the German flag , each boat was commissioned into the Austro @-@ Hungarian Navy . The pair — renamed U @-@ 10 and U @-@ 11 , respectively — were joined by U @-@ 15 , U @-@ 16 , and U @-@ 17 in October . 
Known as the U @-@ 10 or the Okarina ( English : Ocarina ) class as a part of the Austro @-@ Hungarian Navy , the five boats operated primarily in the Adriatic in patrols off Italy and Albania . U @-@ 10 ( ex UB @-@ 1 ) hit a mine in July 1918 and was beached , but had not been repaired by the end of the war . U @-@ 16 was sunk after she torpedoed an Italian destroyer in October 1916 , and the remaining three ( and the unrepaired U @-@ 10 ) were ceded to Italy at the end of the war . + + = = = Bulgarian Navy = = = + + After UB @-@ 8 was transferred to the Bulgarian Navy in May 1916 , she was renamed Podvodnik No. 18 ( in Cyrillic : Пoдвoдник No. 18 ) . She was Bulgaria 's first submarine , and was engaged primarily in coastal defense duties off Bulgaria 's main Black Sea port of Varna . Podvodnik No. 18 survived the war and was ceded to France after the Treaty of Neuilly @-@ sur @-@ Seine . + + = = List of Type UB I submarines = = + + 20 Type UB I submarines were built , 17 for the German Imperial Navy and three for the Austro @-@ Hungarian Navy . Two of the German submarines — UB @-@ 1 and UB @-@ 15 — were sold to Austria @-@ Hungary and commissioned into the Austro @-@ Hungarian Navy as U @-@ 10 and U @-@ 11 , respectively . Those two and a further three built by AG Weser comprised the virtually identical U @-@ 10 class for the Austro @-@ Hungarian Navy . Another of the German submarines , UB @-@ 8 , was sold to Bulgaria in May 1916 , becoming Podvodnik No. 18 . + + = = = German Imperial Navy = = = + + SM UB @-@ 1 ( became the Austro @-@ Hungarian U @-@ 10 , July 1915 ) + SM UB @-@ 2 + SM UB @-@ 3 + SM UB @-@ 4 + SM UB @-@ 5 + SM UB @-@ 6 + SM UB @-@ 7 + SM UB @-@ 8 ( became the Bulgarian Podvodnik No. 
18 , May 1916 ) + SM UB @-@ 9 + SM UB @-@ 10 + SM UB @-@ 11 + SM UB @-@ 12 + SM UB @-@ 13 + SM UB @-@ 14 + SM UB @-@ 15 ( became the Austro @-@ Hungarian U @-@ 11 , June 1915 ) + SM UB @-@ 16 + SM UB @-@ 17 + + = = = Austro @-@ Hungarian Navy = = = + + In the Austro @-@ Hungarian Navy the Type UB I boats were known as the U @-@ 10 class , which consisted of two former German Type UB I boats and three built specifically for Austria @-@ Hungary . + SM U @-@ 10 ( the former German UB @-@ 1 ) + SM U @-@ 11 ( the former German UB @-@ 15 ) + SM U @-@ 15 ( Austria @-@ Hungary ) + SM U @-@ 16 ( Austria @-@ Hungary ) + SM U @-@ 17 ( Austria @-@ Hungary ) + In addition , four of the German Type UB Is assigned to the Pola Flotilla based at the Austro @-@ Hungarian Navy 's main naval base at Pola were assigned Austro @-@ Hungarian designations . + SM UB @-@ 3 ( as U @-@ 9 ) + SM UB @-@ 7 ( as U @-@ 7 ) + SM UB @-@ 8 ( as U @-@ 8 ) + SM UB @-@ 14 ( as U @-@ 26 ) + These four boats remained under commission in the German Imperial Navy , retained German crews and commanders , and received orders from the German flotilla commander at Pola . + + = = = Bulgarian Navy = = = + + Germany and Bulgaria negotiated the purchase of two UB I boats for the Bulgarian Navy , UB @-@ 7 and UB @-@ 8 , in 1916 . Two crews of Bulgarian sailors were sent to Kiel for training . Before the purchase could be completed , UB @-@ 7 was sunk , leaving only one boat for Bulgaria . On 25 May 1916 , UB @-@ 8 was officially transferred to Bulgaria for the remainder of the war . + Podvodnik No. 18 ( the former German UB @-@ 8 ) + Key + + + = Military history of Gibraltar during World War II = + + The military history of Gibraltar during World War II exemplifies Gibraltar 's position as a British fortress since the early 18th century and as a vital factor in British military strategy , both as a foothold on the continent of Europe , and as a bastion of British sea power . 
During World War II , Gibraltar served a vital role in both the Atlantic Theatre and the Mediterranean Theatre , controlling virtually all naval traffic into and out of the Mediterranean Sea from the Atlantic Ocean . + In addition to its commanding position , Gibraltar provided a strongly defended harbour from which ships could operate in both the Atlantic and the Mediterranean . Force H , under the command of Vice @-@ Admiral James Somerville was based in Gibraltar and had the task of maintaining naval superiority and providing a strong escort for convoys to and from the besieged island of Malta . During the course of the war , Gibraltar came under aerial bombardment from Vichy French aircraft and from aircraft of the Italian Royal Air Force ( Regia Aeronautica ) based on Sardinia . Additionally , the fortress was the focus of underwater attacks by the Italian Royal Navy ( Regia Marina ) commando frogman unit ( Decima Flottiglia MAS ) and their human torpedoes . This Italian unit was based on the interned Italian ship SS Olterra in the nearby Spanish harbour of Algeciras . A number of attacks were also carried out by Spanish and Gibraltarian agents acting on behalf of the German Abwehr . + Inside the Rock of Gibraltar itself , miles of tunnels were excavated from the limestone . Masses of rock were blasted out to build an " underground city " . In huge man @-@ made caverns , barracks , offices , and a fully equipped hospital were constructed , complete with an operating theatre and X @-@ ray equipment . + Operation Torch , the Allied invasion of French North Africa in November 1942 , was coordinated from the " Rock " . General Dwight D. Eisenhower , who was given command of the operation , set up his headquarters in Gibraltar during the planning phases of the operation . Following the successful completion of the North African campaign and the surrender of Italy in 1943 , Gibraltar 's role shifted from a forward operating base to a rear @-@ area supply position . 
The harbour continued to operate dry docks and supply depots for the convoy routes through the Mediterranean until V @-@ E Day in 1945 . + + = = Prelude and evacuation = = + + World War II dramatically changed the lives of Gibraltarians . The decision to enforce mass evacuation in order to increase the strength of the Rock with more military and naval personnel meant that most Gibraltarians ( some for up to ten years ) had nowhere to call ' home ' . Only those civilians with essential jobs were allowed to stay but it gave the entire community a sense of being ' British ' by sharing in the war effort . + In early June 1940 , about 13 @,@ 500 evacuees were shipped to Casablanca in French Morocco . However , following the capitulation of the French to the German armies later in June 1940 , the new Pro @-@ German French Vichy Government found the presence of Gibraltarian evacuees in Casablanca an embarrassment and sought opportunities for their removal . The opportunity soon arose when 15 British cargo vessels arrived under Commodore Crichton , repatriating 15 @,@ 000 French servicemen who had been rescued from Dunkirk . Once their own rescued servicemen had disembarked , the ships were interned until they agreed to take away all the evacuees . Although Crichton was unable to obtain permission to clean and restock his ships ( and contrary to British Admiralty orders which forbade the taking on of evacuees ) , when he saw the mass of civilians pouring through the dockyards , he opened up his gangways for boarding . Just beforehand , the British fleet had destroyed a number of French warships at Mers el @-@ Kebir in order to prevent them ending up in German hands . The attack , during which 1 @,@ 297 French sailors died , led to high tensions , which were evident when families were forced at bayonet point by French troops to board taking only what they could carry , leaving many possessions behind . 
However , when they arrived at Gibraltar , the Governor would not allow them to land , fearing that once the evacuees were back on the Rock , it would be virtually impossible to evacuate them a second time . Crowds gathered in John Mackintosh Square in the centre of Gibraltar as the news broke , speeches were made and two City Councillors accompanied by the Acting President of the Exchange and Commercial Library went to see the Governor ( Sir Clive Liddell ) to ask that the evacuees be allowed to land . After receiving instructions from London , a landing was allowed as long as the evacuees returned when other ships arrived to take them away from the Rock , and by 13 July the re @-@ evacuation back to Gibraltar had been completed . + British conservative politician Oliver Stanley agreed to accept the evacuees in the United Kingdom , but he argued with Gibraltar over the number of people involved . The Governor , he declared , had given the number of evacuees first as 13 @,@ 000 , then as 14 @,@ 000 and finally as 16 @,@ 000 . He asked for the situation to be clarified , stressing the shortage of accommodation in Britain and insisting that only 13 @,@ 000 could be accepted , 2 @,@ 000 of whom were to be sent to the Portuguese Atlantic island of Madeira . The situation , replied General Liddell on 19 July , " is that this is a fortress liable to heavy and immediate attack and there should be no civilians here whereas there are 22 @,@ 000 . The 13 @,@ 000 was the number sent to Morocco , and more would have been sent had the situation there not altered . " In London the evacuees were placed in the hands of the Ministry of Health , and many were housed in the Kensington area . Concern for them in Gibraltar mounted as the air raids against London intensified , coupled with the arrival of harrowing letters , describing the circumstances in which the evacuees were living . 
+ In September rumours were already circulating among the evacuees , and in Gibraltar , that the possibility of re @-@ evacuating the Gibraltarians once more was being mooted , this time the destination being Jamaica , in the West Indies . After much contention , it was decided to send a party directly from Gibraltar to the island , and 1 @,@ 093 evacuees left for Jamaica direct , on 9 October , with more following later on . However petitions followed and the demands were met , partly for strategic reasons and the lack of available shipping . The situation at the end of 1940 , therefore , was that approximately 2 @,@ 000 evacuees were in Jamaica and a lesser number in Madeira , with the bulk of around 10 @,@ 000 housed in the London area . + + = = Royal Air Force involvement : 1939 – 1941 = = + + Construction of a solid surface runway began in late 1939 and in 1940 it was proposed to extend the existing runway to a length of 1 @,@ 550 yards ( 1 @,@ 417 m ) . The land reclamation commenced towards the end of 1941 along with the construction of an RAF camp at the " North Front " , now RAF Gibraltar . The RAF dispatched their next squadron to Gibraltar at this time and it was in September 1939 that war with Germany was declared and the strong possibility of German submarines concentrating in the Strait of Gibraltar and using Spanish port facilities , loomed large in Admiralty thinking . So at 09 : 00 ( UTC ) on the 9 September 1939 , No. 202 Squadron RAF was ordered to Gibraltar , loaded to the gunwales with equipment . + On 25 September 1939 , No 200 ( Coastal ) Group was formed as a subordinate formation to HQ RAF Mediterranean in control of No 202 Sqn . The Group 's function was the control of Royal Air Force units operating from Gibraltar . In late 1940 the Group was transferred to Coastal Command . Later a combined HQ was formed which commenced operations in early 1942 . 
+ + = = Vichy French attacks : 1940 = = + + On 18 July 1940 , after the attack on the French Fleet at Mers @-@ el @-@ Kébir by the British , the Vichy government authorized a bombing raid of Gibraltar as a response . Little damage was reported to have been done . + On Tuesday , 24 September , the Italian Stefani news agency reported : " As a reprisal for the bombardment of Dakar yesterday morning , one @-@ hundred @-@ and @-@ twenty French aircraft based in Morocco attacked Gibraltar . " On the same day , the United Press Agency reported : " The French government has issued an official denial of reports , according to which French aircraft were said to have attacked Gibraltar . Up until now , no reprisals have been undertaken . " But the United Press report ended on an ominous note with : " French reprisals are imminent . " + Again , on the same day , the Vichy French government issued orders for the naval base and city of Gibraltar to be bombarded . As a result , six bomber squadrons of the Vichy French Air Force ( Armée de l 'Air de Vichy ) and four squadrons of the Vichy French Navy ( Marine nationale de Vichy ) were employed in the operation . The 64 bombers flew from bases in Oran , Tafaroui ( in Algeria ) , Meknes , Mediouna , and Port Lyautey ( in Morocco ) . The French action was approved by both the German Armistice Commission and the Italian Armistice Commission . + No British aircraft were encountered and much damage was done in the area south of the fortress . The South Mole and a large ship in the harbour were heavily damaged . In the northern part of Gibraltar , fires broke out . + On 25 September , the French returned with a larger force of eighty @-@ three bombers to cause additional damage to the naval base and harbour installations . Again , aircraft of the British Royal Air Force made no appearance . However , the French crews did report encountering heavy anti @-@ aircraft fire . 
One LeO 451 bomber was lost and 13 other aircraft were lightly damaged during the two days of bombing attacks . The British armed trawler HMT Stella Sirius was sunk by bombs . + The air attack on 25 September was the last by Vichy forces on Gibraltar . + + = = Operation Felix : 1940 – 1941 = = + + The Rock came through the war relatively unscathed but , given its strategic importance , Germany made plans to capture Gibraltar . Codenamed " Felix " , the plan which was signed by Adolf Hitler himself was formulated at the highest level of command . With or without permission , Germany would take entry through Spain and attack Gibraltar driving the British out of the Western Mediterranean . The Strait would be effectively closed to the Allies once Gibraltar was in German hands , forcing Asia @-@ bound Allied shipping to steam all the way around Africa rather than to proceed to the east via the shorter route through the Mediterranean and the Suez Canal . The Rock was to be heavily dive bombed by planes leaving France but landing afterward at Spanish air bases . To deny a possible Spanish capture of the base , the German planners decided that the final attack to seize Gibraltar was to be made by German troops alone . + Diplomatic failure at the highest levels of government prevented the operation from occurring at the beginning of 1941 which had been drawn up in detail by the Wehrmacht in the summer and autumn of 1940 . + General Ludwig Kübler 's XLIX Corps would conduct the actual attack on the Rock . The assault forces would comprise the Infantry Regiment Großdeutschland , the 98th Regiment of the 1st Mountain Division , 26 medium and heavy artillery battalions , three observation battalions , three engineer battalions , two smoke battalions , a detachment of 150 Brandenburgers , and up to 150 miniature remote controlled demolition vehicles ( Goliaths ) , packed with high explosives . 
+ As part of a combined @-@ force operation , the German Air Force ( Luftwaffe ) would contribute Ju 88As , Stukas , Messerschmitts , three light AA battalions , and three heavy AA battalions . Nazi Germany 's Kriegsmarine would cooperate by using U @-@ boats to interfere with British naval movement and emplacing coastal batteries to further discourage the Royal Navy . + On 10 March 1941 , with Operation Barbarossa looming , Felix was amended to Operation Felix @-@ Heinrich , whereby German troops would be withdrawn from the USSR to capture Gibraltar . As a result of Spanish dictator Francisco Franco 's intransigence , the operation was postponed , modified , and ultimately abandoned . + + = = Italian bombing of Gibraltar = = + + From Sardinia , Italian Piaggio P.108 bombers attacked Gibraltar several times , mainly in 1942 . The last raids on Gibraltar were carried out during the 1943 Allied landing in Algeria , when those bombers also successfully hit the port of Oran . + The only unit of the Regia Aeronautica ( Royal Air Force ) ever to fly the Piaggio P.108 was the " 274th Long @-@ Range Bombardment Squadron " . This unit was formed in May 1941 around the first machines that came off the assembly lines . The training of the crews lasted far longer than anticipated and only in June 1942 did the 274th become operational . The most spectacular raids with the P. 108 bombers were flown in October 1942 when several night attacks against Gibraltar were undertaken from Sardinia . + After the armistice of Cassibile ( 8 September ) , the German @-@ allied Italian Social Republic launched at least two raids on Gibraltar : one on the night of 4 – 5 June 1944 with ten SM.79bis aircraft and another on 6 June with nine aircraft . Both sorties were undertaken by the Gruppo Aerosiluranti " Buscaglia – Faggioni " . 
+ + = = Italian frogmen raids 1940 – 1943 = = + + Known as the " Floating Trojan Horse of Gibraltar " , Decima Flottiglia MAS , an Italian commando frogman unit created during the Fascist government , engaged in numerous attacks against the harbour at Gibraltar . + Gibraltar was a very tempting target for the Italians , who saw it as a refuge for British warships and allied merchant shipping . The Italian frogmen originally used a Spanish villa ( Villa Carmela ) located two miles ( 3 km ) from Gibraltar owned by an Italian officer who had married a Spanish woman named Conchita Ramognino . Their base was shifted later to the Italian tanker SS Olterra , interned in Algeciras . + + = = Abwehr saboteurs from Spain = = + + Lesser known than the Italian actions were the sabotage operations and limpet @-@ mine attacks carried out by Spanish and Gibraltarian agents recruited in the Campo de Gibraltar by the Germans . The Abwehr contacted a Spanish staff officer from Campo de Gibraltar , Lieutenant Colonel Eleuterio Sánchez Rubio , a Spanish officer , member of the Falange and coordinator of the intelligence operations in the Campo , to establish a network of saboteurs with access to Gibraltar . Sánchez Rubio designated Emilio Plazas Tejera , also a member of Falange , as operations chief of the organisation . Most of the recruits for the sabotage operations were Spaniards from the Campo . A combination of financial reward , ideological commitment and some threats and intimidation were used to gather a significant number of agents . According to the British intelligence , there were at least 183 Spaniards and Gibraltarians involved in the espionage and sabotage operations against Gibraltar . + Sabotage operations were ordered from Berlin in the late autumn of 1940 , but actual work did not start until early 1941 . The first operations were unsuccessful . A first attempt to smuggle a bomb into Gibraltar was aborted , as the timing device was faulty . 
In February there was a large explosion in the North Tunnel , and in April a bomb blew up near the airfield . In June 1941 , however , the British intelligence foiled a new attempt , by a German agent , to attach a mine alongside an Allied cargo ship . Another attempt failed when Plazas placed a bomb inside an ammunition store but was not able to bring the explosive . It was not until 1942 that the operations began to succeed . In January 1942 , two Spanish agents managed to destroy two aircraft at the North Front landing strip . + Financed , trained and equipped by the Germans , the saboteurs sank the armed trawler HMT Erin , and destroyed the auxiliary minesweeper HMT Honju , which resulted in the deaths of six British seamen on 18 January 1942 . Plazas was assisted by the Spanish naval commander of Puente Mayorga , Manuel Romero Hume , who allowed him to beach a rowboat there . British intelligence was able , however , to counteract the sabotage operations . In March 1942 , a Gibraltarian , José Key , one of the most prominent agents working for the Germans , responsible for the collection of information on military movements for the Abwehr , was arrested and executed in Wandsworth Prison in late 1942 . By September 1942 , Plazas , whose activities were closely monitored by the British at that time , resigned and left Carlos Calvo , his second in command , in charge of the operations . In late 1942 , the German headquarters in Berlin ordered the sabotage operations to be expanded . In early 1943 , the arrival of an experienced head of Abwehr operations in Spain improved the outreach of the operations . + In March 1943 an ammunition dump was blown up by Calvo 's agents . The British , growing suspicious of some of the saboteurs , banned them from entering Gibraltar . This forced the Abwehr to ask Calvo for new personnel . 
A Spaniard working on the Rock , José Martín Muñoz , was responsible for the explosion and fire at a large fuel tank at Coaling Island on 30 June 1943 ; this mission , however , would be the first and the last for Muñoz , because he was cornered and arrested by British authorities in August , when he tried to smuggle a bomb into a weapons magazine inside Ragged Staff Cave . After being sentenced to death , he was hanged on 11 January 1944 in Gibraltar by British executioner Albert Pierrepoint . A member of an unrelated Abwehr sabotage network , Luis López Cordón @-@ Cuenca ( also arrested in 1943 ) was executed by Pierrepoint on the same day . Calvo himself was put under arrest by the Spanish police and neutralized . He would be a free man again in December , when he rejoined the Abwehr in Madrid , under direct orders of Wolfgang Blaum , aka Baumann , head of the sabotage section in Spain . After a Falangist attempt against the life of pro @-@ allied General José Enrique Varela , perpetrated by Sánchez Rubio network 's agent Juan José Domínguez and a meeting between Anthony Eden and the Spanish ambassador at London , Jacobo Fitz @-@ James Stuart , Abwehr activities around Gibraltar came to an end . + + = = Operation Tracer : 1941 – 1942 = = + + Operation Tracer was a top @-@ secret British stay @-@ behind spying mission that was only to be implemented if Gibraltar was captured by the Axis Powers . Six men were to be sealed in a cave and left with only enough supplies for a year . The volunteers — two doctors , three signalmen and their leader — would run an observation post with one 12 @-@ inch ( 300 mm ) by 6 @-@ inch ( 150 mm ) slit looking over the harbour and a concealed outdoor terrace over the Mediterranean . The team would then wire back all shipping movements to the British Admiralty . + They were told there would be no way out and anyone who died within the chamber would have to be embalmed and cemented into the brick floor . 
Only if Germany was defeated within their first year would they be released . + As the threat of invasion was clearly felt in late 1941 , an idea for a series of secret observation posts ( first in Gibraltar and later in other places like Malta and Aden ) was put together under Operation Tracer . + Work in Gibraltar began immediately under Commander Geoffrey Birley and his chief engineer Colonel Fordham . The site chosen at Lord Airey 's Battery on the southern tip of the Rock already had an existing tunnelling scheme for a shelter . Extensive trials of the equipment began in January 1942 under the eye of MI6 radio expert Colonel Richard Gambier @-@ Parry . Much thought was also given to the type of men needed for such a strange and demanding task . A member of Scott ’ s ill @-@ fated expedition to the Antarctic , George Murray Levick was called up as Surgeon @-@ Commander to advise on survival techniques . There were practical matters such as diet , exercise , sanitation , and clothing to consider as well as vital " psychology of the personnel " . The full team was in place by the end of summer 1942 and their cavern fully equipped and ready for occupation . A comprehensive manual was prepared on all aspects of the operation and it was considered that similar secret lookout posts should be prepared throughout the world in the event of future wars . However , Operation Tracer was never needed , as Adolf Hitler turned his attention away from Gibraltar and towards the Eastern Front . + The operation had been clouded in mystery until the discovery of papers at the Public Record Office in Kew UK . Previously in the 1960s , details of the story were told to a journalist by his intelligence service contacts and he wrote these up as " Operation Monkey " , yet facts were very sparse . + In 1997 " Stay Behind Cave " ( as it was nicknamed ) was discovered in Gibraltar by the Gibraltar Caving Group , but no account was ever obtained from anyone associated with the mission . 
The discovery came about when the group encountered a strong gust of wind in a tunnel . Further searching led them to break through a wall into chambers which had never been used and had remained sealed for over 50 years . + In November 2006 Jim Crone and Sergeant Major Pete Jackson , senior tunnel guide with the Royal Gibraltar Regiment , met possibly the only member of Operation Tracer still alive when they travelled to meet Dr. W. A. Bruce Cooper at his home in England . Cooper , 92 at the time , provided an opportunity to shed light on the operation with his direct involvement in the mission as a Surgeon @-@ Lieutenant in the Royal Navy Volunteer Reserve ( RNVR ) . He recalled stories about his colleagues , his training , and his feelings about the task . + + = = Mediterranean U @-@ boat Campaign : 1941 – 1944 = = + + The Mediterranean U @-@ boat Campaign lasted approximately from 21 September 1941 to May 1944 . The Kriegsmarine tried to isolate Gibraltar , Malta , and Suez and disrupt Britain 's trade routes . More than sixty U @-@ boats were sent to interdict Allied shipping in the Mediterranean Sea . Many of these U @-@ boats were themselves attacked negotiating the Strait of Gibraltar controlled by Britain . Nine U @-@ boats were sunk while attempting passage and ten more were damaged . + + = = North African Campaign : 1942 = = + + Plans for the Allied counter offensive after the attack on Pearl Harbor were ongoing by mid @-@ 1942 . An invasion of Europe in 1943 would be unworkable , but the allies could attack the " soft underbelly of Europe " through the Mediterranean , as Prime Minister Winston Churchill put it . Devised by President Franklin Roosevelt and Churchill and code named Operation Torch , the plan was to occupy French North Africa : Morocco , Algeria , and Tunisia . From these French colonies , attacks could be launched that would drive Italy out of the war . + In July 1942 , Lieutenant General Dwight D. 
Eisenhower was appointed Allied Commander @-@ in @-@ Chief of Operation Torch . Churchill placed Gibraltar under the command of General Eisenhower as the temporary headquarters for this , the first large @-@ scale Anglo @-@ American operation of the war . He arrived in Gibraltar on 5 November 1942 to take over , not just command of Operation Torch itself , but also military command of Gibraltar . + General Eisenhower stayed at The Convent , the official Governor 's residence , but his operational headquarters were in a small chamber in a tunnel in the heart of the Rock . In his memoirs General Eisenhower wrote : + The subterranean passages under the Rock provided the sole available office space , and in them was located the signal equipment by which we expected to keep in touch with the commanders of the three assault forces . The eternal darkness of the tunnels was here and there partially pierced by feeble electric bulbs . Damp , cold air in block @-@ long passages was heavy with stagnation and did not noticeably respond to the clattering efforts of electric fans . Through the arched ceilings came a constant drip , drip , drip of surface water that faithfully but drearily ticked off the seconds of the interminable , almost unendurable , wait which always occurs between completion of a military plan and the moment action begins . + One hundred thousand soldiers on the high seas in a multitude of transports converged on Gibraltar . More than 400 aircraft of all types were crammed into the dispersal areas around the Gibraltar runway . Fighters had been shipped in crates and assembled on the airfield . Every available area of storage was taken up with ammunition , fuel , and other essential supplies . 168 American pilots were housed in the RAF messes at North Front . + On 8 November 1942 , 466 aircraft from Gibraltar landed on captured North African airfields . 
+ From their headquarters in Gibraltar , General Eisenhower and Admiral Sir Andrew Browne Cunningham directed Operation Torch , the first major combined combat operation during World War II involving American and British forces . + + = = = War tunnels = = = + + Given that Gibraltar was a small town with only a few defences protecting it , the solution was to build a massive series of tunnels and chambers inside the natural protection of the Rock of Gibraltar . This " town " inside the Rock contained its own power station , water supply , and hospital . Some soldiers posted here would not see the light of day for months on end . Two Canadian engineer companies , the only soldiers with diamond @-@ tipped drills and 5 British engineer companies , added some 30 miles ( 48 km ) of such tunnels , a feat thought impossible at the time . That was enough to hold all 30 @,@ 000 troops on the rock . Today , the rock has more underground tunnels than roads . + + = = Death of Władysław Sikorski : 1943 = = + + On 4 July 1943 , a Liberator bomber from RAF Transport Command took off from Gibraltar for England . On board was General Władysław Sikorski , Prime Minister of Poland 's London @-@ based government in exile and Commander @-@ in @-@ Chief of its armed forces , returning from visiting Polish troops in the Middle East . + The aircraft climbed normally from the runway , levelled off to gather speed but then suddenly lost height and crashed into the harbour . The 62 @-@ year @-@ old general died , along with 15 others . The sole survivor was the Czech @-@ born pilot , Eduard Prchal , who was rescued by an RAF launch . The bodies of five passengers and crew , including Sikorski 's daughter , were never found . + The coffins of General Sikorski and his Chief @-@ of @-@ Staff , General Kilimecki , were draped in the Polish National Flag and lay in state in the Cathedral of St. Mary the Crowned . After a Requiem Mass , the bodies were carried in procession to the H.M. 
Dockyard with full Military Honours to be shipped to London in anticipation that General Sikorski 's remains would one day be returned to a liberated Poland . The route to the dockyard was lined by British troops and the coffins carried and escorted by Polish Servicemen . + + = = = Investigation = = = + + In 1943 a British Court of Inquiry investigated the crash of Sikorski 's Liberator II AL523 , but was unable to determine the probable cause , finding only that it was an accident and the " aircraft became uncontrollable for reasons which cannot be established " . A popular theory was insufficient technical maintenance leading to jamming aircraft controls . Despite this finding , the political context of the event , coupled with a variety of curious circumstances , immediately gave rise to speculation that Sikorski 's death had been no accident , and may in fact have been the direct result of a Soviet , British or even Polish conspiracy . + + = = Aftermath = = + + The surrender of Italy in September 1943 lifted any possible objections to the return of the evacuees to the Rock . As a result , a Resettlement Board was established in November , and at a meeting of the Board on 8 February 1944 repatriation priorities were finally agreed . On 6 April 1944 the first group of 1 @,@ 367 repatriates arrived on the Rock directly from the United Kingdom and on 28 May , the first repatriation party left Madeira , and by the end of 1944 only 520 non @-@ priority evacuees remained on the island . + In London , home @-@ comers were making claims on the evacuees ’ wartime accommodation and 500 Gibraltarians were re @-@ evacuated to Scotland and 3 @,@ 000 to camps in Northern Ireland . Although the Governor , Lt. General Sir Noel Mason @-@ MacFarlane , fought valiantly on behalf of the evacuees and did not accept the lack of accommodation as a sufficient reason for the delays , as late as 1947 there were still 2 @,@ 000 in Northern Irish camps . 
The last of the evacuees did not see the Rock again until 1951 . + + = = See Also = = + + Military history of the British Commonwealth in the Second World War + + + = Nerva = + + Nerva ( Latin : Marcus Cocceius Nerva Caesar Augustus ; 8 November , 30 AD – 27 January , 98 AD ) was Roman Emperor from 96 to 98 . Nerva became Emperor at the age of sixty @-@ five , after a lifetime of imperial service under Nero and the rulers of the Flavian dynasty . Under Nero , he was a member of the imperial entourage and played a vital part in exposing the Pisonian conspiracy of 65 . Later , as a loyalist to the Flavians , he attained consulships in 71 and 90 during the reigns of Vespasian and Domitian respectively . + On 18 September 96 , Domitian was assassinated in a palace conspiracy involving members of the Praetorian Guard and several of his freedmen . On the same day , Nerva was declared emperor by the Roman Senate . This was the first time the Senate elected a Roman Emperor . As the new ruler of the Roman Empire , he vowed to restore liberties which had been curtailed during the autocratic government of Domitian . + Nerva 's brief reign was marred by financial difficulties and his inability to assert his authority over the Roman army . A revolt by the Praetorian Guard in October 97 essentially forced him to adopt an heir . After some deliberation Nerva adopted Trajan , a young and popular general , as his successor . After barely fifteen months in office , Nerva died of natural causes on 27 January 98 . Upon his death he was succeeded and deified by Trajan . + Although much of his life remains obscure , Nerva was considered a wise and moderate emperor by ancient historians . Nerva 's greatest success was his ability to ensure a peaceful transition of power after his death , thus founding the Nerva – Antonine dynasty . 
+ + = = Early career = = + + + = = = Family = = = + + Marcus Cocceius Nerva was born in the village of Narni , 50 kilometers north of Rome , to the family of Marcus Cocceius Nerva , Suffect Consul in 40 , and Sergia Plautilla . Ancient sources report the date as either 30 or 35 . He had at least one attested sister , named Cocceia , who married Lucius Salvius Titianus Otho , the brother of the future Emperor Otho . + Like Vespasian , the founder of the Flavian dynasty , Nerva was a member of the Italian nobility rather than one of the elite of Rome . Nevertheless , the Cocceii were among the most esteemed and prominent political families of the late Republic and early Empire , attaining consulships in each successive generation . The direct ancestors of Nerva on his father 's side , all named Marcus Cocceius Nerva , were associated with imperial circles since the time of Emperor Augustus ( 27 BC – AD 14 ) . + His great @-@ grandfather was Consul in 36 BC ( in replacement , and abdicated ) , and Governor of Asia in the same year . His grandfather became Consul Suffect in July of either 21 or 22 , and was known as a personal friend of Emperor Tiberius ( AD 14 – 37 ) , accompanying the emperor during his voluntary seclusion on Capri from 23 onwards , dying in 33 . Nerva 's father , finally , attained the consulship in 40 under emperor Caligula ( 37 – 41 ) . The Cocceii were connected with the Julio @-@ Claudian dynasty through the marriage of Sergia Plautilla 's brother Octavius Laenas , and Rubellia Bassa , the great @-@ granddaughter of Tiberius . + + = = = Imperial service = = = + + Not much of Nerva 's early life or career is recorded , but it appears he did not pursue the usual administrative or military career . He was praetor @-@ elect in the year 65 and , like his ancestors , moved in imperial circles as a skilled diplomat and strategist . As an advisor to Emperor Nero , he successfully helped detect and expose the Pisonian conspiracy of 65 . 
Exactly what his contribution to the investigation was is not known but his services must have been considerable , since they earned him rewards equal to those of Nero 's guard prefect Tigellinus . He received triumphal honors — which were usually reserved for military victories — and the right to have his statues placed throughout the palace . + According to the contemporary poet Martial , Nero also held Nerva 's literary abilities in high esteem , hailing him as the " Tibullus of our time " . Another prominent member of Nero 's entourage was Vespasian , an old and respected general who had celebrated military triumphs during the 40s . It appears Vespasian befriended Nerva during his time as an imperial advisor , and may have asked him to watch over Vespasian 's youngest son Domitian when Vespasian departed for the Jewish war in 67 . + The suicide of Nero on 9 June 68 brought the Julio @-@ Claudian dynasty to an end , leading to the chaotic Year of the Four Emperors , which saw the successive rise and fall of the emperors Galba , Otho and Vitellius , until the accession of Vespasian on 21 December 69 . Virtually nothing is known of Nerva 's whereabouts during 69 , but despite the fact that Otho was his brother @-@ in @-@ law , he appears to have been one of the earliest and strongest supporters of the Flavians . + For services unknown , he was rewarded with a consulship early in Vespasian 's reign in 71 . This was a remarkable honour , not only because he held this office early under the new regime , but also because it was an ordinary consulship ( instead of a less prestigious suffect consulship ) , making him one of the few non @-@ Flavians to be honoured in this way under Vespasian . After 71 Nerva again disappears from historical record , presumably continuing his career as an inconspicuous advisor under Vespasian ( 69 – 79 ) and his sons Titus ( 79 – 81 ) and Domitian ( 81 – 96 ) . + He re @-@ emerges during the revolt of Saturninus in 89 . 
On 1 January , 89 , the governor of Germania Superior , Lucius Antonius Saturninus , and his two legions at Mainz , Legio XIV Gemina and Legio XXI Rapax , revolted against the Roman Empire with the aid of a tribe of the Chatti . The governor of Germania Inferior , Lappius Maximus , moved to the region at once , assisted by the procurator of Rhaetia , Titus Flavius Norbanus . Within twenty @-@ four days the rebellion was crushed , and its leaders at Mainz savagely punished . The mutinous legions were sent to the front of Illyricum , while those who had assisted in their defeat were duly rewarded . + Domitian opened the year following the revolt by sharing the consulship with Nerva . Again , the honour suggested Nerva had played a part in uncovering the conspiracy , perhaps in a fashion similar to what he did during the Pisonian conspiracy under Nero . Alternatively , Domitian may have selected Nerva as his colleague to emphasise the stability and status @-@ quo of the regime . The revolt had been suppressed , and the Empire could return to order . + + = = Emperor = = + + + = = = Accession = = = + + On 18 September , 96 , Domitian was assassinated in a palace conspiracy organised by court officials . The Fasti Ostienses , the Ostian Calendar , records that the same day the Senate proclaimed Marcus Cocceius Nerva emperor . Despite his political experience , this was a remarkable choice . Nerva was old and childless , and had spent much of his career out of the public light , prompting both ancient and modern authors to speculate on his involvement in Domitian 's assassination . + According to Cassius Dio , the conspirators approached Nerva as a potential successor prior to the assassination , which indicates that he was at least aware of the plot . Suetonius by contrast does not mention Nerva , but he may have omitted his role out of tactfulness . 
Considering the works of Suetonius were published under Nerva 's direct descendants Trajan and Hadrian , it would have been less than sensitive of him to suggest the dynasty owed its accession to murder . On the other hand , Nerva lacked widespread support in the Empire , and as a known Flavian loyalist his track record would not have recommended him to the conspirators . The precise facts have been obscured by history , but modern historians believe Nerva was proclaimed Emperor solely on the initiative of the Senate , within hours after the news of the assassination broke . + Although he appeared to be an unlikely candidate on account of his age and weak health , Nerva was considered a safe choice precisely because he was old and childless . Furthermore , he had close connections with the Flavian dynasty and commanded the respect of a substantial part of the Senate . Nerva had seen the anarchy which had resulted from the death of Nero ; he knew that to hesitate even for a few hours could lead to violent civil conflict . Rather than decline the invitation and risk revolts , he accepted . The decision may have been hasty so as to avoid civil war , but neither the Senate nor Nerva appears to have been involved in the conspiracy against Domitian . + Following the accession of Nerva as emperor , the Senate passed damnatio memoriae on Domitian : his coins and statues were melted , his arches were torn down and his name was erased from all public records . In many instances , existing portraits of Domitian , such as those found on the Cancelleria Reliefs , were simply recarved to fit the likeness of Nerva . This allowed quick production of new images and recycling of previous material . In addition , the vast palace which Domitian had erected on the Palatine Hill , known as the Flavian Palace , was renamed the " House of the People " , and Nerva himself took up residence in Vespasian 's former villa in the Gardens of Sallust . 
+ + = = = Administration = = = + + The change of government was welcome particularly to the senators , who had been harshly persecuted during Domitian 's reign . As an immediate gesture of goodwill towards his supporters , Nerva publicly swore that no senators would be put to death as long as he remained in office . He called an end to trials based on treason , released those who had been imprisoned under these charges , and granted amnesty to many who had been exiled . + All properties which had been confiscated by Domitian were returned to their respective families . Nerva also sought to involve the Senate in his government , but this was not entirely successful . He continued to rely largely on friends and advisors that were known and trusted , and by maintaining friendly relations with the pro @-@ Domitianic faction of the Senate , he incurred hostility which may have been the cause for at least one conspiracy against his life . + Having been proclaimed emperor solely on the initiative of the Senate , Nerva had to introduce a number of measures to gain support among the Roman populace . As was custom by this time , a change of emperor was expected to bring with it a generous payment of gifts and money to the people and the army . Accordingly , a congiarium of 75 denarii per head was bestowed upon the citizens , while the soldiers of the Praetorian Guard received a donativum which may have amounted to as much as 5000 denarii per person . This was followed by a string of economic reforms intended to alleviate the burden of taxation from the most needy Romans . + To the poorest , Nerva granted allotments of land worth up to 60 million sesterces . He exempted parents and their children from a 5 % inheritance tax , and he made loans to Italian landowners on the condition that they pay interest of 5 % to their municipality to support the children of needy families ; alimentary schemes which were later expanded by Trajan , Antoninus Pius , and Marcus Aurelius . 
Furthermore , numerous taxes were remitted and privileges granted to Roman provinces . Namely , he probably abolished the Fiscus Iudaicus , the additional tax which all Jews throughout the Empire had to pay : some of his coins bear the legend FISCI IUDAICI CALUMNIA SUBLATA ( abolition of malicious prosecution regarding the Jewish tax ) . + Before long , Nerva 's expenses strained the economy of Rome and , although perhaps not ruinous to the extent once suggested by Syme , necessitated the formation of a special commission of economy to drastically reduce expenditures . The most superfluous religious sacrifices , games and horse races were abolished , while new income was generated from Domitian 's former possessions , including the auctioning of ships , estates , and even furniture . Large amounts of money were obtained from Domitian 's silver and gold statues , and Nerva forbade that similar images be made in his honor . + Because he reigned only briefly , Nerva 's public works were few ; instead , he completed projects which had been initiated under Flavian rule . This included extensive repairs to the Roman road system and the expansion of the aqueducts . The latter program was headed by the former consul Sextus Julius Frontinus , who helped to put an end to abuses and later published a significant work on Rome 's water supply , De Aquis Urbis Romae . The only major landmarks constructed under Nerva were a granary , known as the Horrea Nervae , and a small Imperial Forum begun by Domitian , which linked the Forum of Augustus to the Temple of Peace . Little remains , partly because the Via dei Fori Imperiali cuts across it . + + = = = Crisis of succession = = = + + Despite Nerva 's measures to remain popular with the Senate and the Roman people , support for Domitian remained strong in the army , which had called for his deification immediately after the assassination . 
In an attempt to appease the soldiers of the Praetorian Guard , Nerva had dismissed their prefect Titus Petronius Secundus — one of the chief conspirators against Domitian — and replaced him with a former commander , Casperius Aelianus . + Likewise , the generous donativum bestowed upon the soldiers following his accession was expected to swiftly silence any protests against the violent regime change . The Praetorians considered these measures insufficient , however , and demanded the execution of Domitian 's assassins , which Nerva refused . Continued dissatisfaction with this state of affairs would ultimately lead to the gravest crisis of Nerva 's reign . + While the swift transfer of power following Domitian 's death had prevented a civil war from erupting , Nerva 's position as an emperor soon proved too vulnerable , and his benign nature turned into a reluctance to assert his authority . Upon his accession , he had ordered a halt to treason trials , but at the same time allowed the prosecution of informers by the Senate to continue . This measure led to chaos , as everyone acted in his own interests while trying to settle scores with personal enemies , leading the consul Fronto to famously remark that Domitian 's tyranny was ultimately preferable to Nerva 's anarchy . Early in 97 , a conspiracy led by the senator Gaius Calpurnius Piso Crassus Frugi Licinianus failed , but once again Nerva refused to put the conspirators to death , much to the disapproval of the Senate . + The situation was further aggravated by the absence of a clear successor , made more pressing because of Nerva 's old age and sickness . He had no natural children of his own and only distant relatives , who were unsuited for political office . A successor would have to be chosen from among the governors or generals in the Empire and it appears that , by 97 , Nerva was considering to adopt Marcus Cornelius Nigrinus Curiatius Maternus , the powerful governor of Syria . 
This was covertly opposed by those who supported the more popular military commander Marcus Ulpius Traianus , commonly known as Trajan , a general of the armies at the German frontier . + In October 97 these tensions came to a head when the Praetorian Guard , led by Casperius Aelianus , laid siege to the Imperial Palace and took Nerva hostage . He was forced to submit to their demands , agreeing to hand over those responsible for Domitian 's death and even giving a speech thanking the rebellious Praetorians . Titus Petronius Secundus and Parthenius , Domitian 's former chamberlain , were sought out and killed . Nerva was unharmed in this assault , but his authority was damaged beyond repair . + He realized that his position was no longer tenable without the support of an heir who had the approval of both the army and the people . Shortly thereafter , he announced the adoption of Trajan as his successor , and with this decision all but abdicated . Trajan was formally bestowed with the title of Caesar and shared the consulship with Nerva in 98 : + Contrary to the view here popularized by Cassius Dio , however , Nerva had in fact little choice with regard to his successor . Faced with a major crisis , he desperately needed the support of a man who could restore his damaged reputation . The only candidate with sufficient military experience , consular ancestry , and connections was Trajan . Likewise , Edward Gibbon 's assertion that Nerva hereby established a tradition of succession through adoption among the Five Good Emperors has found little support among modern historians . + + = = Death and legacy = = + + On 1 January , 98 , at the start of his fourth consulship , Nerva suffered a stroke during a private audience . Shortly thereafter he was struck by a fever and died at his villa in the Gardens of Sallust , on 28 January . He was deified by the Senate , and his ashes were laid to rest in the Mausoleum of Augustus . 
+ Nerva was succeeded without incident by his adopted son Trajan , who was greeted by the Roman populace with much enthusiasm . According to Pliny the Younger , Trajan dedicated a temple in honour of Nerva , yet no trace of it has ever been found ; nor was a commemorative series of coins for the Deified Nerva issued until ten years after his death . According to Cassius Dio , however , the Guard prefect responsible for the mutiny against Nerva , Casperius Aelianus , was ' dismissed ' upon Trajan 's accession . + Due to the lack of written sources on this period , much of Nerva 's life has remained obscure . The most substantial surviving account of the reign of Nerva was written by the 3rd @-@ century historian Cassius Dio . His Roman History , which spans nearly a millennium , from the arrival of Aeneas in Italy until the year 229 , was composed more than one hundred years after Nerva had died . Further details are added by an abridged biography from the Epitome de Caesaribus , a work alleged to have been authored by the 4th @-@ century historian Aurelius Victor . + A more comprehensive text , presumed to describe the life of Nerva in closer detail , is the Histories , by the contemporary historian Tacitus . The Histories is an account of the history of Rome covering three decades from the suicide of emperor Nero in 69 until the death of Domitian in 96 . Unfortunately , a substantial part of the work has been lost , with only the first five books covering the Year of the Four Emperors remaining . In the introduction to his biography of Gnaeus Julius Agricola however , Tacitus speaks highly of Nerva , describing his reign as " the dawn of a most happy age , [ when ] Nerva Caesar blended things once irreconcilable , sovereignty and freedom " . + The surviving histories speak equally positively of Nerva 's brief reign , although none offer a substantial commentary on his policies . 
Both Cassius Dio and Aurelius Victor emphasize his wisdom and moderation , with Dio commending his decision to adopt Trajan as his heir . These views were later popularized by the 18th @-@ century historian Edward Gibbon in his History of the Decline and Fall of the Roman Empire . Gibbon considered Nerva the first of the Five Good Emperors , five successive rulers under whom the Roman Empire " was governed by absolute power , under the guidance of wisdom and virtue " from 96 until 180 . Nevertheless , even Gibbon notes that , compared to his successors , Nerva may have lacked the necessary qualifications for a successful reign : + Modern history has expanded upon this sentiment , characterizing Nerva as a well @-@ intentioned but weak and ineffectual ruler . The Roman Senate enjoyed renewed liberties under his rule , but Nerva 's mismanagement of the state finances and lack of authority over the army ultimately brought Rome near the edge of a significant crisis . The mutiny led by Casperius Aelianus was never intended as a coup , but a calculated attempt to put pressure on the emperor . The adoption of Trajan expanded his power base with a respected , reliable general as his successor . Murison concludes that Nerva 's real talents were in fact ill @-@ suited to the emperorship : + His place in Roman history is therefore summarized as a necessary , if tumultuous stop @-@ gap before the Trajanic @-@ Antonine dynasties . It is a fact of irony that even the only major public work completed during his reign , the Forum of Nerva , ultimately became known as the Forum Transitorium , or transitional forum . + Two modern statues which commemorate Nerva can be found in towns associated with him . There is an equestrian statue in Gloucester , England , a town which was founded in his honour . It is at the entrance to Southgate Street . There is also a statue at his alleged birthplace , Narni in Italy , at Cocceio Nerva street . 
+ + = = Nerva – Antonine family tree = = + + + = = In popular culture = = + + Nerva was played by Norman Wooland in the 1951 film Quo Vadis . + He was also played by Giuliano Gemma in the 1964 film Revolt of the Praetorians . + + = = = Secondary material = = = + + Narnia web links , International links ' , International links from Narnia.it web site + Wend , David ( 1998 ) . " Nerva ( 96 – 98 A.D. ) " . De Imperatoribus Romanis . Retrieved 2007 @-@ 08 @-@ 11 . + Pelham , Henry Francis ( 1911 ) . " Nerva , Marcus Cocceius " . In Chisholm , Hugh . Encyclopædia Britannica 19 ( 11th ed . ) . Cambridge University Press. pp. 393 – 394 . + + = The Hustler ( film ) = + + The Hustler is a 1961 American drama film directed by Robert Rossen from Walter Tevis 's 1959 novel of the same name , adapted for the screen by Rossen and Sidney Carroll . It tells the story of small @-@ time pool hustler " Fast Eddie " Felson and his desire to break into the " major league " of professional hustling and high @-@ stakes wagering by high @-@ rollers that follows it . He throws his raw talent and ambition up against the best player in the country ; seeking to best the legendary pool player " Minnesota Fats . " After initially losing to Fats and getting involved with unscrupulous manager Bert Gordon , Eddie returns to try again , but only after paying a terrible personal price . + The film was shot on location in New York City . It stars Paul Newman as " Fast " Eddie Felson , Jackie Gleason as Minnesota Fats , Piper Laurie as Sarah , and George C. Scott as Bert . + The Hustler was a major critical and popular success , gaining a reputation as a modern classic . Its exploration of winning , losing , and character garnered a number of major awards ; it is also credited with helping to spark a resurgence in the popularity of pool . 
Real @-@ life pool player Rudolf Wanderone , known at the time as " New York Fats " and " Chicago Fats " , claimed to be the real life inspiration for Gleason 's character , Minnesota Fats , and adopted the name as his own . + + = = Plot = = + + Small @-@ time pool hustler " Fast Eddie " Felson travels cross @-@ country with his partner Charlie to challenge the legendary player " Minnesota Fats " . Arriving at Fats ' home pool hall , Eddie declares he will win $ 10 @,@ 000 that night . Fats arrives and he and Eddie agree to play straight pool for $ 200 a game . After initially falling behind , Eddie surges back to being $ 1 @,@ 000 ahead and suggests raising the bet to $ 1 @,@ 000 a game ; Fats agrees . He sends out a runner , Preacher , to Johnny 's Bar , ostensibly for whiskey , but really to get professional gambler Bert Gordon to the hall . Eddie gets ahead $ 11 @,@ 000 and Charlie tries to convince him to quit , but Eddie insists the game will end only when Fats says it is over . Fats agrees to continue after Bert labels Eddie a " loser . " After 25 hours and an entire bottle of bourbon , Eddie is ahead over $ 18 @,@ 000 , but loses it all along with all but $ 200 of his original stake . At their hotel later , Eddie leaves half of the remaining stake with a sleeping Charlie and leaves . + Eddie stashes his belongings at the local bus terminal , where he meets Sarah Packard , an alcoholic who is supported by her father , attends college part @-@ time , and walks with a limp . He meets her again at a bar . They go back to her place but she refuses to let him in , saying he is " too hungry " . Eddie moves into a rooming house and starts hustling for small stakes . He finds Sarah again and this time she takes him in , but with reservations . Charlie finds Eddie at Sarah 's and tries to persuade him to go back out on the road . Eddie refuses and Charlie realizes he plans to challenge Fats again . 
Eddie realizes that Charlie held out his percentage and becomes enraged , believing that with that money he could have rebounded to beat Fats . Eddie dismisses Charlie as a scared old man and tells him to " go lie down and die " by himself . + At Johnny 's Bar , Eddie joins a poker game where Bert is playing , and loses $ 20 . Afterward , Bert tells Eddie that he has talent as a pool player but no character . He figures that Eddie will need at least $ 3 @,@ 000 to challenge Fats again . Bert calls him a " born loser " but nevertheless offers to stake him in return for 75 % of his winnings ; Eddie refuses . + Eddie humiliates a local pool shark , exposing himself as a hustler , and the other players punish him by breaking his thumbs . As he heals , Sarah cares for him and tells him she loves him , but he cannot say the words in return . When Eddie is ready to play , he agrees to Bert 's terms , deciding that a " 25 % slice of something big is better than a 100 % slice of nothing " . + Bert , Eddie , and Sarah travel to the Kentucky Derby , where Bert arranges a match for Eddie against a wealthy local socialite named Findley . The game turns out to be carom billiards , not pool . When Eddie loses badly , Bert refuses to keep staking him . Sarah pleads with Eddie to leave with her , saying that the world he is living in and its inhabitants are " perverted , twisted , and crippled " ; he refuses . Seeing Eddie 's anger , Bert agrees to let the match continue at $ 1 @,@ 000 a game . Eddie comes back to win $ 12 @,@ 000 . He collects his $ 3 @,@ 000 share and decides to walk back to the hotel . Bert arrives first and subjects Sarah to a humiliating sexual encounter . After , she scrawls " PERVERTED " , " TWISTED " , and " CRIPPLED " in lipstick on the bathroom mirror . Eddie arrives back at the hotel to learn that she has killed herself . + Eddie returns to challenge Fats again , putting up his entire $ 3 @,@ 000 stake on a single game . 
He wins game after game , beating Fats so badly that Fats is forced to quit . Bert demands a share of Eddie 's winnings and threatens that Eddie will be injured unless he pays . But Eddie says that if he is not killed he will kill Bert when he recovers ; invoking the memory of Sarah , he shames Bert into giving up his claim . Instead , Bert orders Eddie never to walk into a big @-@ time pool hall again . Eddie and Fats compliment each other as players , and Eddie walks out . + + = = Cast = = + + Cast notes + Pool champion Willie Mosconi has a cameo appearance as Willie , who holds the stakes for Eddie and Fats 's games . Mosconi 's hands also appear in many of the closeup shots . + + = = Production = = + + The Tevis novel had been optioned several times , including by Frank Sinatra , but attempts to adapt it for the screen were unsuccessful . Director Rossen 's daughter Carol Rossen speculates that previous adaptations focused too much on the pool aspects of the story and not enough on the human interaction . Rossen , who had hustled pool himself as a youth and who had made an abortive attempt to write a pool @-@ themed play called Corner Pocket , optioned the book and teamed with Sidney Carroll to produce the script . + According to Bobby Darin 's agent , Martin Baum , Paul Newman 's agent turned down the part of Fast Eddie . Newman was originally unavailable to play Fast Eddie regardless , being committed to star opposite Elizabeth Taylor in the film Two for the Seesaw . Rossen offered Darin the part after seeing him on The Mike Wallace Interview . When Taylor was forced to drop out of Seesaw because of shooting overruns on Cleopatra , Newman was freed up to take the role , which he accepted after reading just half of the script . No one associated with the production officially notified Darin or his representatives that he had been replaced ; they found out from a member of the public at a charity horse race . 
+ Rossen filmed The Hustler over six weeks , entirely in New York City . Much of the action was filmed at two now @-@ defunct pool halls , McGirr 's and Ames Billiard Academy . Other shooting locations included a townhouse on East 82nd Street , which served as the Louisville home of Murray Hamilton 's character Findley , and the Manhattan Greyhound bus terminal . The film crew built a dining area that was so realistic that confused passengers sat there and waited to place their orders . Willie Mosconi served as technical advisor on the film and shot a number of the trick shots in place of the actors . All of Gleason 's shots were his own ; they were filmed in wide @-@ angle to emphasize having the actor and the shot in the same frames . Rossen , in pursuit of the style he termed " neo @-@ neo @-@ realistic " , hired actual street thugs , enrolled them in the Screen Actors Guild and used them as extras . Scenes that were included in the shooting script but did not make it into the final film include a scene at Ames pool hall establishing that Eddie is on his way to town ( originally slated to be the first scene of the film ) and a longer scene of Preacher talking to Bert at Johnny 's Bar which establishes Preacher is a junkie . + Early shooting put more focus on the pool playing , but during filming Rossen made the decision to place more emphasis on the love story between Newman and Laurie 's characters . Despite the change in emphasis , Rossen still used the various pool games to show the strengthening of Eddie 's character and the evolution of his relationship to Bert and Sarah , through the positioning of the characters in the frame . For example , when Eddie is playing Findley , Eddie is positioned below Bert in a two shot but above Findley while still below Bert in a three shot . When Sarah enters the room , she is below Eddie in two shot while in a three shot Eddie is still below Bert . 
When Eddie is kneeling over Sarah 's body , Bert again appears above him but Eddie attacks Bert , ending up on top of him . Eddie finally appears above Bert in two shot when Eddie returns to beat Fats . + + = = Themes = = + + The Hustler is fundamentally a story of what it means to be a human being , couched within the context of winning and losing . Describing the film , Robert Rossen said : " My protagonist , Fast Eddie , wants to become a great pool player , but the film is really about the obstacles he encounters in attempting to fulfill himself as a human being . He attains self @-@ awareness only after a terrible personal tragedy which he has caused — and then he wins his pool game . " Roger Ebert concurs with this assessment , citing The Hustler as " one of the few American movies in which the hero wins by surrendering , by accepting reality instead of his dreams . " + The film was also somewhat autobiographical for Rossen , relating to his dealings with the House Un @-@ American Activities Committee . A screenwriter during the 1930s and ' 40s , he had been involved with the Communist Party in the 1930s and refused to name names at his first HUAC appearance . Ultimately he changed his mind and identified friends and colleagues as party members . Similarly , Felson sells his soul and betrays the one person who really knows and loves him in a Faustian pact to gain character . + Film and theatre historian Ethan Mordden has identified The Hustler as one of a handful of films from the early 1960s that re @-@ defined the relationship of films to their audiences . This new relationship , he writes , is " one of challenge rather than flattery , of doubt rather than certainty . " No film of the 1950s , Mordden asserts , " took such a brutal , clear look at the ego @-@ affirmation of the one @-@ on @-@ one contest , at the inhumanity of the winner or the castrated vulnerability of the loser . 
" Although some have suggested the resemblance of this film to classic film noir , Mordden rejects the comparison based on Rossen 's ultra @-@ realistic style , also noting that the film lacks noir 's " Treacherous Woman or its relish in discovering crime among the bourgeoisie , hungry bank clerks and lusty wives . " Mordden does note that while Fast Eddie " has a slight fifties ring " , the character " makes a decisive break with the extraordinarily feeling tough guys of the ' rebel ' era ... [ b ] ut he does end up seeking out his emotions " and telling Bert that he is a loser because he 's dead inside . + + = = Reception = = + + The Hustler had its world premiere in Washington , D.C. on September 25 , 1961 . Prior to the premiere , Richard Burton hosted a midnight screening of the film for the casts of the season 's Broadway shows , which generated a great deal of positive word of mouth . Initially reluctant to publicize the film , 20th Century Fox responded by stepping up its promotional activities . + The film was well received by critics , although with the occasional caveat . Variety praised the performances of the entire main cast but felt that the " sordid aspects " of the story prevented the film from achieving the " goal of being pure entertainment . " Variety also felt the film was far too long . Stanley Kauffmann , writing for The New Republic , concurred in part with this assessment . Kauffmann strongly praised the principal cast , calling Newman " first @-@ rate " and writing that Scott 's was " his most credible performance to date . " Laurie , he writes , gives her part " movingly anguished touches " ( although he also mildly criticizes her for over @-@ reliance on Method acting ) . 
While he found that the script " strains hard to give an air of menace and criminality to the pool hall " and also declares it " full of pseudo @-@ meaning " , Kauffmann lauds Rossen 's " sure , economical " direction , especially in regard to Gleason who , he says , does not so much act as " [ pose ] for a number of pictures which are well arranged by Rossen . It is the best use of a manikin by a director since Kazan photographed Burl Ives as Big Daddy . " The New York Times , despite finding that the film " strays a bit " and that the romance between Newman and Laurie 's characters " seems a mite far @-@ fetched " , nonetheless found that The Hustler " speaks powerfully in a universal language that spellbinds and reveals bitter truths . " + The Hustler received nine Academy Award nominations . The film won two , for Best Art Direction @-@ Set Decoration , Black @-@ and @-@ White ( Harry Horner and Gene Callahan ) and Best Cinematography , Black @-@ and @-@ White ( Eugen Schüfftan ) . The film was also nominated for Best Picture and Newman was nominated for Best Actor in a Leading Role . Gleason and Scott were both nominated for Best Actor in a Supporting Role ; Scott refused the nomination . Laurie was nominated for Best Actress in a Leading Role . Rossen received nominations for Best Director and , with Carroll , for Best Writing , Screenplay Based on Material from Another Medium . + Newman was nominated for a Golden Globe Award for Best Actor . Gleason and Scott were each nominated for Best Supporting Actor and Scott was also nominated as Best New Star of the Year . At the 1962 BAFTA Awards , The Hustler tied with the Soviet film Ballad of a Soldier for Best Film from Any Source . Newman won for Best Foreign Actor and Piper Laurie was nominated for Best Foreign Actress . Gleason was honored as Best Supporting Actor by the National Board of Review of Motion Pictures and the film was named among the Board 's ten best films of 1961 . 
Rossen was named Best Director by the New York Film Critics Circle Awards and Rossen and Carroll shared the Writers Guild of America Award for Best Written Drama . + American Film Institute Lists + AFI 's 100 Years ... 100 Movies - Nominated + AFI 's 100 Years ... 100 Thrills - Nominated + AFI 's 100 Years ... 100 Heroes and Villains : + Bert Gordon - Nominated Villain + AFI 's 100 Years ... 100 Movie Quotes : + " Eddie , you 're a born loser . " - Nominated + AFI 's 100 Years ... 100 Movies ( 10th Anniversary Edition ) - Nominated + AFI 's 10 Top 10 - # 6 Sports Film + + = = Legacy = = + + In the decades since its release , The Hustler has cemented its reputation as a classic . Roger Ebert , echoing earlier praise for the performances , direction , and cinematography and adding laurels for editor Dede Allen , cites the film as " one of those films where scenes have such psychic weight that they grow in our memories . " He further cites Fast Eddie Felson as one of " only a handful of movie characters so real that the audience refers to them as touchstones . " TV Guide calls the film a " dark stunner " offering " a grim world whose only bright spot is the top of the pool table , yet [ with ] characters [ who ] maintain a shabby nobility and grace . " The four leads are again lavishly praised for their performances and the film is summed up as " not to be missed . " + Paul Newman reprised his role as Fast Eddie Felson in the 1986 film The Color of Money , for which he won the Academy Award for Best Actor in a Leading Role . A number of observers and critics have suggested that this Oscar was in belated recognition for his performance in The Hustler . In 1997 , the Library of Congress selected The Hustler for preservation in the United States National Film Registry as " culturally , historically , or aesthetically significant . 
" Carroll and Rossen 's screenplay was selected by the Writers Guild of America in 2006 as the 96th best motion picture screenplay of all time . In June 2008 , AFI released its " Ten top Ten " — the best ten films in ten " classic " American film genres — after polling over 1 @,@ 500 people from the creative community . The Hustler was acknowledged as the sixth best film in the sports genre . + The Hustler is credited with sparking a resurgence in the popularity of pool in the United States , which had been on the decline for decades . The film also brought recognition to Willie Mosconi , who , despite having won multiple world championships , was virtually unknown to the general public . Perhaps the greatest beneficiary of the film 's popularity was a real @-@ life pool hustler named Rudolf Wanderone . Mosconi claimed in an interview at the time of the film 's release that the character of Minnesota Fats was based on Wanderone , who at the time was known as " New York Fatty " . Wanderone immediately adopted the Minnesota Fats nickname and parlayed his association with the film into book and television deals and other ventures . Author Walter Tevis denied for the rest of his life that Wanderone had played any role in the creation of the character . Other players would claim , with greater or lesser degrees of credibility , to have served as models for Fast Eddie , including Ronnie Allen , Ed Taylor , Ed Parker , and Eddie Pelkey . 
+ From ce7078d7869db9423b188a599d2d4b8e124775db Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Tue, 19 Mar 2024 18:56:02 +0000 Subject: [PATCH 147/159] Fix triton build condition --- Dockerfile.rocm | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index a143f37ab4f2f..b82e4e99fe109 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -84,15 +84,14 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \ fi # build triton -RUN if [ "$BUILD_TRITON" = "1"]; then \ +RUN if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ && pip uninstall -y triton \ && git clone https://github.com/ROCm/triton.git \ && cd triton/python \ && pip3 install -e . \ - && cd ../.. \ - && rm -r triton; \ + && cd ../..; \ fi COPY ./ /app/vllm From 0e63661b0cdac0c60857f7cc277819c5b88ed2f6 Mon Sep 17 00:00:00 2001 From: jpvillam Date: Tue, 19 Mar 2024 11:49:21 -0400 Subject: [PATCH 148/159] Small fix on dockerfile --- Dockerfile.rocm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index a7640f6841ad9..080e5b04d28bc 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -99,11 +99,11 @@ RUN if [ "$BUILD_CUPY" = "1" ]; then \ fi # build triton -RUN if [ "$BUILD_TRITON" = "1"]; then \ +RUN if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ && pip uninstall -y triton \ - && git clone https://github.com/ROCmSoftwarePlatform/triton.git + && git clone https://github.com/ROCmSoftwarePlatform/triton.git \ && cd triton/python \ && pip3 install -e . \ && cd ../..; \ From c45547bc8b27e8af19b32e2ab71386969f435f7f Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 19 Mar 2024 16:07:32 -0500 Subject: [PATCH 149/159] Update description of measure_ppl_MC_small.py Added invocation examples into the description. 
--- benchmarks/measure_ppl_MC_small.py | 32 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/benchmarks/measure_ppl_MC_small.py b/benchmarks/measure_ppl_MC_small.py index 2b344058d056b..f1301776d6b27 100755 --- a/benchmarks/measure_ppl_MC_small.py +++ b/benchmarks/measure_ppl_MC_small.py @@ -1,13 +1,29 @@ #!/usr/bin/env python3 -# -# This is a quick hack that produces PPL measurement by -# iteratively dumping the logprob vector for the single next symbol -# that is to be generated over the preloaded context. -# It is actually an *inefficient* procedure because for the -# N-token string it takes N*(preload + generation) time instead of -# preload + N*generation -# +""" +This is a quick hack that produces PPL measurement by +iteratively dumping the logprob vector for the single next symbol +that is to be generated over the preloaded context. + +It is actually an *inefficient* procedure because for the +N-token string it takes N*(preload + generation) time instead of +preload + N*generation + +Quick correctness validation tips: + +Running llama-2-7b model +( ./vllm/benchmarks/measure_ppl_MC_small.py --model=/data/models/llama-2-7b-chat-hf --data=./vllm/tests/prompts/wiki.test.raw --context-size=2048 --batch-size=1 -tp=1 ) +should result in PPL~6.447469639345 + +Running llama-2-13b model +( ./vllm/benchmarks/measure_ppl_MC_small.py --model=/data/models/llama-2-13b-chat-hf --data=./vllm/tests/prompts/wiki.test.raw --context-size=2048 --batch-size=1 -tp=1 ) +should result in PPL~5.675290252052 + +Running llama-2-70b model +( ./vllm/benchmarks/measure_ppl_MC_small.py --model=/data/models/llama-2-70b-chat-hf --data=./vllm/tests/prompts/wiki.test.raw --context-size=2048 --batch-size=1 -tp=1 ) +should result in PPL~4.2067624908705 + +""" import numpy as np from transformers import LlamaForCausalLM, LlamaTokenizer From d4cb905dfec5abdbbe9f585ff4ee9da59efedbfe Mon Sep 17 00:00:00 2001 From: jpvillam Date: Tue, 19 Mar 2024 19:41:43
-0400 Subject: [PATCH 150/159] Rebase updates and PR review changes Added Flag for controlling triton vs default flow. More small changes to dockerfile --- Dockerfile.rocm | 2 +- .../layers/attention/attention.py | 47 ++++++++++++------- .../layers/attention/backends/flash_attn.py | 36 +++++++++----- .../attention/ops/flash_attention_triton.py | 33 ++++++------- 4 files changed, 70 insertions(+), 48 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 080e5b04d28bc..e7f52307a6aa2 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -105,7 +105,7 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \ && pip uninstall -y triton \ && git clone https://github.com/ROCmSoftwarePlatform/triton.git \ && cd triton/python \ - && pip3 install -e . \ + && pip3 install . \ && cd ../..; \ fi diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 4b63b9eaf59a7..89b5816f7a47a 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -8,6 +8,7 @@ from vllm.logger import init_logger from vllm.model_executor.input_metadata import InputMetadata from vllm.utils import is_hip +import os logger = init_logger(__name__) @@ -34,11 +35,12 @@ def __init__( sliding_window: Optional[int] = None, ) -> None: super().__init__() - if _use_flash_attn(): + if use_triton := _use_flash_attn(): from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, - sliding_window) + sliding_window, + use_triton == 2) else: from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend # noqa: E501 self.backend = XFormersBackend(num_heads, head_size, scale, @@ -59,26 +61,37 @@ def forward( @lru_cache(maxsize=1) -def _use_flash_attn() -> bool: - try: - import flash_attn # noqa: F401 - except ImportError: - logger.info("flash_attn 
is not found. Using xformers backend.") - return False - - if is_hip(): - # AMD GPUs. - return False - if torch.cuda.get_device_capability()[0] < 8: +def _use_flash_attn() -> int: + """Returns if and which flash attention to use. + + Returns: + int: 0 for none, 1 for default implementation, 2 for triton implementation. + """ + if not (os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') and is_hip()): + # AMD GPUs can use flash_attn package or triton impl. + try: + import flash_attn # noqa: F401 + except ImportError: + logger.info("flash_attn is not found. Using xformers backend.") + return 0 + + if (not is_hip()) and torch.cuda.get_device_capability()[0] < 8: # Volta and Turing NVIDIA GPUs. logger.info("flash_attn is not supported on Turing or older GPUs. " "Using xformers backend.") - return False + return 0 + + if is_hip() and torch.cuda.get_device_capability()[0] != 9: + # not Instinct series GPUs. + logger.info("flash_atten is not supported on NAVI GPUs. " + "Using xformers backend.") + return 0 + if torch.get_default_dtype() not in (torch.float16, torch.bfloat16): logger.info( "flash_attn only supports torch.float16 or torch.bfloat16. 
" "Using xformers backend.") - return False + return 0 - logger.info("Using flash_attn backend.") - return True + logger.info(f"Using {'Triton' if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') else ''} flash_attn backend.") + return 2 if os.environ.get('VLLM_USE_FLASH_ATTN_TRITON') else 1 diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index c2d7b5acc467e..726b42cad9e3f 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -8,7 +8,7 @@ from vllm.model_executor.input_metadata import InputMetadata from vllm.model_executor.layers.attention.ops.paged_attn import ( PagedAttentionImpl) -from vllm.model_executor.layers.attention.ops.flash_attention_triton import attention +from vllm.model_executor.layers.attention.ops.flash_attention_triton import triton_attention class FlashAttentionBackend: @@ -21,6 +21,7 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + use_triton: Optional[bool] = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -30,6 +31,7 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes + self.use_triton = use_triton assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -87,8 +89,8 @@ def forward( query = query.unflatten(0, (batch_size, seq_len)) key = key.unflatten(0, (batch_size, seq_len)) value = value.unflatten(0, (batch_size, seq_len)) - if is_hip(): - output, _ = attention( + if self.use_triton: + output, _ = triton_attention( query, key, value, @@ -98,15 +100,25 @@ def forward( self.scale, ) else: - output = flash_attn_func( - query, - key, - value, - softmax_scale=self.scale, - causal=True, - window_size=self.sliding_window, - 
alibi_slopes=self.alibi_slopes, - ) + if is_hip(): + #XXX: window_size and alibi_slopes not supported + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + ) + else: + output = flash_attn_func( + query, + key, + value, + softmax_scale=self.scale, + causal=True, + window_size=self.sliding_window, + alibi_slopes=self.alibi_slopes, + ) else: # prefix-enabled attention output = PagedAttentionImpl.forward_prefix( diff --git a/vllm/model_executor/layers/attention/ops/flash_attention_triton.py b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py index 37c15e0e6fa36..80962e4cf9d9a 100644 --- a/vllm/model_executor/layers/attention/ops/flash_attention_triton.py +++ b/vllm/model_executor/layers/attention/ops/flash_attention_triton.py @@ -251,12 +251,12 @@ def attn_fwd( ) acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=Out.type.element_ty) # We still need to write 0s to the result - tl.store(O_block_ptr, acc.to(Out.type.element_ty), boundary_check=(0,1)) - l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + #tl.store(O_block_ptr, acc.to(Out.type.element_ty), boundary_check=(0,1)) + #l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m # We store inf to LSE, not -inf because in the bwd pass, we subtract this # from qk which makes it -inf, such that exp(qk - inf) = 0 for these masked blocks. - l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) - tl.store(l_ptrs, l) + #l = tl.full([BLOCK_M], value=float("inf"), dtype=tl.float32) + #tl.store(l_ptrs, l) # TODO: Should dropout and return encoded softmax be handled here too? 
return @@ -417,17 +417,17 @@ def attn_fwd( z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE - l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m + #l_ptrs = L + off_z * hq * MAX_SEQLENS_Q + off_h_q * MAX_SEQLENS_Q + offs_m # If seqlen_q not multiple of BLOCK_M, we need to mask out the last few rows. # This is only true for the last M block. For others, overflow_size will be -ve - overflow_size = end_m_idx - seqlen_q - if overflow_size > 0: - boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) - # This is a > check because mask being 0 blocks the store. - l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) - tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) - else: - tl.store(l_ptrs, m_i + tl.math.log2(l_i)) + #overflow_size = end_m_idx - seqlen_q + #if overflow_size > 0: + # boundary = tl.full((BLOCK_M,), BLOCK_M - overflow_size, dtype=tl.int32) + # # This is a > check because mask being 0 blocks the store. + # l_ptrs_mask = boundary > tl.arange(0, BLOCK_M) + # tl.store(l_ptrs, m_i + tl.math.log2(l_i), mask=l_ptrs_mask) + #else: + # tl.store(l_ptrs, m_i + tl.math.log2(l_i)) # write back O o_offset = off_z * stride_oz + cu_seqlens_q_start * stride_om + off_h_q * stride_oh @@ -494,8 +494,6 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): encoded_softmax = None - M = torch.empty((batch, nheads_q, metadata.max_seq_len), device=q.device, dtype=torch.float32) - # Seed the RNG so we get reproducible results for testing. 
philox_seed = 0x1BF52 philox_offset = 0x1D4B42 @@ -507,7 +505,7 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): bias_strides = (0,0,0,0) attn_fwd[grid]( - q, k, v, bias, sm_scale, M, o, + q, k, v, bias, sm_scale, None, o, *q_strides, *k_strides, *v_strides, *o_strides, *bias_strides, None, None, dropout_p=0.0, @@ -526,7 +524,6 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): RETURN_ENCODED_SOFTMAX=False ) - ctx.save_for_backward(q, k, v, o, M) ctx.grid = grid ctx.sm_scale = sm_scale ctx.BLOCK_DMODEL = head_size @@ -538,4 +535,4 @@ def forward(ctx, q, k, v, o, metadata, causal=False, sm_scale=1.0, bias=None): ctx.return_encoded_softmax = False return o, encoded_softmax -attention = _attention.apply +triton_attention = _attention.apply From 9d96fdb62a0a5f81873798b8bceb59e78691b9de Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 20 Mar 2024 23:47:48 +0000 Subject: [PATCH 151/159] Introducing torchrun multi GPU support --- benchmarks/benchmark_latency.py | 6 + benchmarks/benchmark_throughput.py | 11 +- csrc/hip_compat.h | 39 ++++++ vllm/config.py | 4 +- vllm/engine/arg_utils.py | 6 + vllm/engine/llm_engine.py | 3 + vllm/executor/torchrun_gpu_executor.py | 173 +++++++++++++++++++++++++ vllm/worker/worker.py | 19 ++- 8 files changed, 252 insertions(+), 9 deletions(-) create mode 100644 csrc/hip_compat.h create mode 100644 vllm/executor/torchrun_gpu_executor.py diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 2fdc08c5c26df..8ff04fccb0004 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -27,6 +27,7 @@ def main(args: argparse.Namespace): kv_cache_dtype=args.kv_cache_dtype, device=args.device, ray_workers_use_nsight=args.ray_workers_use_nsight, + worker_use_torchrun=args.worker_use_torchrun ) sampling_params = SamplingParams( @@ -151,5 +152,10 @@ def run_to_completion(profile_dir: Optional[str] = None): action='store_true', 
help="If specified, use nsight to profile ray workers", ) + parser.add_argument('--worker-use-torchrun', + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. Preferable for ROCm' + ) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 72bdc4b3b4540..00f9167d7e8e9 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -75,6 +75,7 @@ def run_vllm( device: str, enable_prefix_caching: bool, gpu_memory_utilization: float = 0.9, + worker_use_torchrun: bool = False, ) -> float: from vllm import LLM, SamplingParams llm = LLM(model=model, @@ -89,7 +90,8 @@ def run_vllm( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, device=device, - enable_prefix_caching=enable_prefix_caching) + enable_prefix_caching=enable_prefix_caching, + worker_use_torchrun=args.worker_use_torchrun,) # Add the requests to the engine. for prompt, _, output_len in requests: @@ -213,7 +215,8 @@ def main(args: argparse.Namespace): args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, args.enforce_eager, args.kv_cache_dtype, args.device, - args.enable_prefix_caching, args.gpu_memory_utilization) + args.enable_prefix_caching, args.gpu_memory_utilization, + args.worker_use_torchrun) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -314,6 +317,10 @@ def main(args: argparse.Namespace): "--enable-prefix-caching", action='store_true', help="enable automatic prefix caching for vLLM backend.") + parser.add_argument('--worker-use-torchrun', + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. 
Preferable for ROCm') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/csrc/hip_compat.h b/csrc/hip_compat.h new file mode 100644 index 0000000000000..d9fe30b1e7b5d --- /dev/null +++ b/csrc/hip_compat.h @@ -0,0 +1,39 @@ +// !!! This is a file automatically generated by hipify!!! +#pragma once + +#ifdef USE_ROCM +#include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#ifndef USE_ROCM + #define VLLM_LDG(arg) __ldg(arg) +#else + #define VLLM_LDG(arg) *(arg) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) +#else + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) +#else + #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) +#endif + +#ifndef USE_ROCM + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#else + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#endif + diff --git a/vllm/config.py b/vllm/config.py index de687395a0001..bb01300f762a2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -412,6 +412,7 @@ def __init__( pipeline_parallel_size: int, tensor_parallel_size: int, worker_use_ray: bool, + worker_use_torchrun: bool, max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, ray_workers_use_nsight: bool = False, @@ -428,6 +429,7 @@ def __init__( else: self.tensor_parallel_size = tensor_parallel_size self.worker_use_ray = worker_use_ray + self.worker_use_torchrun = worker_use_torchrun self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce 
self.ray_workers_use_nsight = ray_workers_use_nsight @@ -435,7 +437,7 @@ def __init__( self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. - if self.world_size > 1 and not is_neuron(): + if not self.worker_use_torchrun and self.world_size > 1 and not is_neuron(): self.worker_use_ray = True self._verify_args() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c3dccdd5bb50b..557fb9c2d4061 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -21,6 +21,7 @@ class EngineArgs: seed: int = 0 max_model_len: Optional[int] = None worker_use_ray: bool = False + worker_use_torchrun: bool = False pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None @@ -153,6 +154,10 @@ def add_cli_args( action='store_true', help='use Ray for distributed serving, will be ' 'automatically set when using more than 1 GPU') + parser.add_argument('--worker-use-torchrun', + action='store_true', + help='use torchrun instead of ray when using ' + 'more than 1 GPU. 
Preferable for ROCm') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -317,6 +322,7 @@ def create_engine_configs( parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray, + self.worker_use_torchrun, self.max_parallel_loading_workers, self.disable_custom_all_reduce, self.ray_workers_use_nsight) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4cdad4180aa14..e8272968d190d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -126,6 +126,9 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": initialize_ray_cluster(parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor executor_class = RayGPUExecutor + elif parallel_config.worker_use_torchrun: + from vllm.executor.torchrun_gpu_executor import TorchrunGPUExecutor + executor_class = TorchrunGPUExecutor else: assert parallel_config.world_size == 1, ( "Ray is required if parallel_config.world_size > 1.") diff --git a/vllm/executor/torchrun_gpu_executor.py b/vllm/executor/torchrun_gpu_executor.py new file mode 100644 index 0000000000000..88823ba5d4920 --- /dev/null +++ b/vllm/executor/torchrun_gpu_executor.py @@ -0,0 +1,173 @@ +import importlib +import os +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.model_executor.parallel_utils.communication_op import broadcast_object_list, broadcast_tensor_dict +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_ip, get_open_port, get_distributed_init_method, + make_async) + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker 
module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + + +class TorchrunGPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = int(os.getenv("LOCAL_RANK", "0")) + self.is_driver_worker = self.local_rank == 0 + + # Instantiate the worker and load the model to GPU. + self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + assert self.parallel_config.world_size > 1, ( + "TorchrunGPUExecutor only supports multiple GPUs.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=self.local_rank, + rank=self.local_rank, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=self.is_driver_worker, + ) + self.worker.init_model() + self.worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. 
+ + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config. + gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self.worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self.worker.warm_up_model() + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + output = self.worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + if self.is_driver_worker: + broadcast_object_list([output], src=0) + else: + res = [None] + broadcast_object_list(res, src=0) + output = res[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self.worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return self.worker.remove_lora(lora_id) + + def list_loras(self) -> List[int]: + return self.worker.list_loras() + + def check_health(self) -> None: + # TorchrunGPUExecutor will always be healthy as long as + # it's running. + return + + +class TorchrunGPUExecutorAsync(TorchrunGPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # TorchrunGPUExecutor will always be healthy as long as + # it's running. + return diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 0dcd4018afa5f..6f446db6c1cda 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -268,12 +268,19 @@ def init_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - torch.distributed.init_process_group( - backend="nccl", - world_size=parallel_config.world_size, - rank=rank, - init_method=distributed_init_method, - ) + if parallel_config.worker_use_torchrun: + torch.distributed.init_process_group( + backend="nccl", + world_size=parallel_config.world_size, + init_method="env://", + ) + else: + torch.distributed.init_process_group( + backend="nccl", + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) if cupy_utils.is_initialized(): cupy_world_size = cupy_utils.get_world_size() From 51ce9f53e9cef0b84b77715ff1ae5708979a9fb5 Mon Sep 17 00:00:00 2001 From: charlifu Date: Thu, 21 Mar 2024 15:39:24 +0000 Subject: [PATCH 152/159] add use case for custom kernel for matvec operation --- 
vllm/model_executor/layers/linear.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 40e681df48f86..77e03aba573ad 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,7 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger +from vllm import custom_ops logger = init_logger(__name__) @@ -72,6 +73,20 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] + if x.shape[0] == 1: + m, n, k = weight.shape[0], x.shape[0], x.shape[1] + out = torch.empty(x.shape[0], weight.shape[0], dtype=x.dtype) + if k == 8192 and (m == 1280 or m == 7168): + custom_ops.LLMM1(weight, x, out, 8) + elif k == 3584 and m == 8192: + custom_ops.LLMM1(weight, x, out, 8) + elif k <= 8192 and k % 8 == 0 and m % 4 == 0: + custom_ops.LLMM1(weight, x, out, 4) + else: + out = F.linear(x, weight) + if bias != None: + out = out + bias + return out if self.separate_bias_add: if bias is not None: return F.linear(x, weight) + bias From 47c560ebd2fc86017ff596396f20aec80401fb83 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 16:14:48 +0000 Subject: [PATCH 153/159] Remove ignored file --- csrc/hip_compat.h | 39 --------------------------------------- 1 file changed, 39 deletions(-) delete mode 100644 csrc/hip_compat.h diff --git a/csrc/hip_compat.h b/csrc/hip_compat.h deleted file mode 100644 index d9fe30b1e7b5d..0000000000000 --- a/csrc/hip_compat.h +++ /dev/null @@ -1,39 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! 
-#pragma once - -#ifdef USE_ROCM -#include -#endif - -#ifndef USE_ROCM - #define WARP_SIZE 32 -#else - #define WARP_SIZE warpSize -#endif - -#ifndef USE_ROCM - #define VLLM_LDG(arg) __ldg(arg) -#else - #define VLLM_LDG(arg) *(arg) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask) -#else - #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) -#endif - -#ifndef USE_ROCM - #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) -#else - #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) -#endif - -#ifndef USE_ROCM - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) -#else - #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ - hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) -#endif - From 97e197855fbcce75e6af2498851d366fcd408255 Mon Sep 17 00:00:00 2001 From: charlifu Date: Thu, 21 Mar 2024 16:27:34 +0000 Subject: [PATCH 154/159] limit the custom kernel under is_hip --- vllm/model_executor/layers/linear.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 77e03aba573ad..6398c53bfda06 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -13,6 +13,7 @@ divide, split_tensor_along_last_dim) from vllm.model_executor.utils import set_weight_attrs from vllm.logger import init_logger +from vllm.utils import is_hip from vllm import custom_ops logger = init_logger(__name__) @@ -73,7 +74,7 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] - if x.shape[0] == 1: + if is_hip() and x.shape[0] == 1: m, n, k = weight.shape[0], x.shape[0], x.shape[1] out = torch.empty(x.shape[0], weight.shape[0], 
dtype=x.dtype) if k == 8192 and (m == 1280 or m == 7168): From ed96036690843ccbe3d99fde4d8ddb717c8ddcf9 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 17:55:28 +0000 Subject: [PATCH 155/159] Fix parameter --- benchmarks/benchmark_throughput.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 89c7e8dafe0ec..5b8d4419245c8 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -9,6 +9,7 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) from tqdm import tqdm +from torch.distributed import launch def sample_requests( @@ -93,7 +94,7 @@ def run_vllm( scales_path=scales_path, device=device, enable_prefix_caching=enable_prefix_caching, - worker_use_torchrun=args.worker_use_torchrun,) + worker_use_torchrun=worker_use_torchrun,) # Add the requests to the engine. for prompt, _, output_len in requests: From 42324b630f7ead5c826ab4ace39aa7638ba03693 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 18:00:29 +0000 Subject: [PATCH 156/159] Unused import --- benchmarks/benchmark_throughput.py | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 5b8d4419245c8..95b59601f6b12 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -9,7 +9,6 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) from tqdm import tqdm -from torch.distributed import launch def sample_requests( From 9b1388c093a70577eaabb0e77c7fab1da9f03990 Mon Sep 17 00:00:00 2001 From: charlifu Date: Thu, 21 Mar 2024 21:00:56 +0000 Subject: [PATCH 157/159] fix custom kernel --- vllm/model_executor/layers/linear.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/layers/linear.py 
b/vllm/model_executor/layers/linear.py index 6398c53bfda06..edcb448741f7e 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -74,17 +74,25 @@ def apply_weights(self, x: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] - if is_hip() and x.shape[0] == 1: - m, n, k = weight.shape[0], x.shape[0], x.shape[1] - out = torch.empty(x.shape[0], weight.shape[0], dtype=x.dtype) + if is_hip() and x.view(-1, x.size(-1)).shape[0] == 1: + batched = False + if x.dim() == 3: + inp = x.view(-1, x.size(-1)) + batched = True + else: + inp = x + m, n, k = weight.shape[0], inp.shape[0], inp.shape[1] + out = torch.empty(inp.shape[0], weight.shape[0], dtype=inp.dtype, device='cuda') if k == 8192 and (m == 1280 or m == 7168): - custom_ops.LLMM1(weight, x, out, 8) + custom_ops.LLMM1(weight, inp, out, 8) elif k == 3584 and m == 8192: - custom_ops.LLMM1(weight, x, out, 8) + custom_ops.LLMM1(weight, inp, out, 8) elif k <= 8192 and k % 8 == 0 and m % 4 == 0: - custom_ops.LLMM1(weight, x, out, 4) + custom_ops.LLMM1(weight, inp, out, 4) else: - out = F.linear(x, weight) + out = F.linear(inp, weight) + if batched: + out = out.view(x.shape[0], x.shape[1], weight.shape[0]) if bias != None: out = out + bias return out From 6b186bb7b4f35d691e3a290a0f11c5c72ed35802 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Thu, 21 Mar 2024 21:16:03 +0000 Subject: [PATCH 158/159] Refactor torchrun executor to reuse single gpu executor code --- vllm/executor/torchrun_gpu_executor.py | 89 ++++---------------------- vllm/model_executor/models/llama.py | 3 - 2 files changed, 13 insertions(+), 79 deletions(-) diff --git a/vllm/executor/torchrun_gpu_executor.py b/vllm/executor/torchrun_gpu_executor.py index 88823ba5d4920..837e6e9368e77 100644 --- a/vllm/executor/torchrun_gpu_executor.py +++ b/vllm/executor/torchrun_gpu_executor.py @@ -2,6 +2,7 @@ import os from typing import Dict, List, Optional +from 
vllm.executor.gpu_executor import GPUExecutor from vllm.lora.request import LoRARequest from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig) @@ -22,7 +23,7 @@ } -class TorchrunGPUExecutor(ExecutorBase): +class TorchrunGPUExecutor(GPUExecutor): def __init__( self, @@ -33,27 +34,15 @@ def __init__( device_config: DeviceConfig, lora_config: Optional[LoRAConfig], ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.is_driver_worker = self.local_rank == 0 + super().__init__(model_config, + cache_config, + parallel_config, + scheduler_config, + device_config, + lora_config) - # Instantiate the worker and load the model to GPU. - self._init_worker() - - # Profile the memory usage and initialize the cache. - self._init_cache() - - def _dispatch_worker(self): - worker_module = DEVICE_TO_WORKER_MODULE_MAP[ - self.device_config.device_type] - imported_worker = importlib.import_module(worker_module) - Worker = imported_worker.Worker - return Worker def _init_worker(self): # Lazy import the Worker to avoid importing torch.cuda/xformers @@ -65,7 +54,7 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.worker = Worker( + self.driver_worker = Worker( self.model_config, self.parallel_config, self.scheduler_config, @@ -77,50 +66,15 @@ def _init_worker(self): kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=self.is_driver_worker, ) - self.worker.init_model() - self.worker.load_model() - - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine first profiles the existing memory usage. - Then, it allocates the remaining memory for KV blocks. - - .. 
tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_gpu_blocks, num_cpu_blocks = ( - self.worker.profile_num_available_blocks( - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config. - gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - )) - - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self.worker.init_cache_engine(cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self.worker.warm_up_model() + self.driver_worker.init_model() + self.driver_worker.load_model() def execute_model(self, seq_group_metadata_list: List[SequenceGroupMetadata], blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - output = self.worker.execute_model( + output = self.driver_worker.execute_model( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, @@ -134,23 +88,6 @@ def execute_model(self, output = res[0] return output - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self.worker.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." 
- return self.worker.remove_lora(lora_id) - - def list_loras(self) -> List[int]: - return self.worker.list_loras() - - def check_health(self) -> None: - # TorchrunGPUExecutor will always be healthy as long as - # it's running. - return - - class TorchrunGPUExecutorAsync(TorchrunGPUExecutor, ExecutorAsyncBase): async def execute_model_async( @@ -160,7 +97,7 @@ async def execute_model_async( blocks_to_swap_out: Dict[int, int], blocks_to_copy: Dict[int, List[int]], ) -> SamplerOutput: - output = await make_async(self.worker.execute_model)( + output = await make_async(self.driver_worker.execute_model)( seq_group_metadata_list=seq_group_metadata_list, blocks_to_swap_in=blocks_to_swap_in, blocks_to_swap_out=blocks_to_swap_out, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 6eb645ca675d9..aff5369c74e6c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -77,16 +77,13 @@ def __init__( self.act_fn = SiluAndMul() def forward(self, x): - #print(f'>>>Shape of x in mlp {x.shape} {self.gate_up_proj.weight.shape}') if x.shape[0] == 1 and x.shape[1] == 1: - out = torch.empty(x.shape[0],self.gate_up_proj.weight.shape[0]//2,dtype=x.dtype,device=x.device) custom_ops.LLMM_Silu(self.gate_up_proj.weight,x.view(-1,x.size(-1)),out,8) x = out.view(x.shape[0], x.shape[1], out.shape[1]) else: gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) - #print(f'>>> x.shape {x.shape}') x, _ = self.down_proj(x) return x From 6ff02728d2dd400e787e92e866dfba1a89a9f4fc Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 22 Mar 2024 00:06:32 +0000 Subject: [PATCH 159/159] Fixed mixed up values --- csrc/attention/attention_kernels.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index 7005509094fec..f8841e12e77f2 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ 
-815,11 +815,11 @@ template< int BLOCK_SIZE, bool IS_FP8_KV_CACHE, #ifdef USE_ROCM - int NUM_THREADS = 128, - int PARTITION_SIZE = 512> -#else int NUM_THREADS = 1024, int PARTITION_SIZE = 1024> +#else + int NUM_THREADS = 128, + int PARTITION_SIZE = 512> #endif void paged_attention_v2_launcher( torch::Tensor& out,