microsoft · lcy-seso · Jan 3, 2025 · Dec 11, 2024 · Dec 28, 2024 · Dec 29, 2024
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,19 @@
+{
+    "files.associations": {
+        "array": "cpp",
+        "string": "cpp",
+        "string_view": "cpp",
+        "span": "cpp",
+        "bitset": "cpp",
+        "initializer_list": "cpp",
+        "utility": "cpp",
+        "*.tcc": "cpp",
+        "chrono": "cpp",
+        "random": "cpp",
+        "limits": "cpp",
+        "semaphore": "cpp"
+    },
+    "gotoSymbolStack.currentStackPosition": 0,
+    "gotoSymbolStack.maxStackPosition": 0,
+    "gotoSymbolStack.filePositionInfo": []
+}
diff --git a/benchmarks/cpp/flashattention/CMakeLists.txt b/benchmarks/cpp/flashattention/CMakeLists.txt
@@ -0,0 +1,20 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved. Licensed under the
+# MIT License.
+# --------------------------------------------------------------------------
+
+cmake_minimum_required(VERSION 3.25 FATAL_ERROR)
+project(gemm_bench LANGUAGES C CXX CUDA)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}
+                      "${PROJECT_SOURCE_DIR}/../../../cmake")
+set(THIRD_PARTY_DIR "${PROJECT_SOURCE_DIR}/../../../3rd-party")
+
+include(generic)
+
+include_directories("${PROJECT_SOURCE_DIR}/../../../include")
+include_directories("${PROJECT_SOURCE_DIR}/../../utils/cpp")
+include_directories("${THIRD_PARTY_DIR}/cutlass/include")
+
+add_executable(flash_attn main.cu)
+target_link_libraries(flash_attn ${CUDA_CUBLAS_LIBRARIES})
diff --git a/benchmarks/cpp/flashattention/Makefile b/benchmarks/cpp/flashattention/Makefile
@@ -0,0 +1,16 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+
+
+BUILD_DIR := build
+
+.PHONY: build clean
+
+build:
+	@mkdir -p $(BUILD_DIR)
+	@cd $(BUILD_DIR) && cmake .. && make -j$(proc)
+
+clean:
+	@rm -rf $(BUILD_DIR)
diff --git a/benchmarks/cpp/flashattention/convert.cuh b/benchmarks/cpp/flashattention/convert.cuh
@@ -0,0 +1,68 @@
+#pragma once
+
+#include "cuda_utils.cuh"
+
+#include <cute/layout.hpp>
+#include <cute/tensor.hpp>
+#include <cutlass/numeric_conversion.h>
+
+namespace benchmarks {
+namespace cutlass_wrapper {
+
+using namespace cute;
+
+template <typename To_type, typename Engine, typename Layout>
+CUTE_DEVICE auto convert_type(cute::Tensor<Engine, Layout> const& tensor) {
+    using From_type = typename Engine::value_type;
+    constexpr int numel = decltype(size(tensor))::value;
+    cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
+    auto frag =
+        convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel>*>(
+            tensor.data()));
+    return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
+}
+
+template <typename Layout>
+DEVICE auto convert_layout_rowcol_Aregs(Layout rowcol_layout) {
+    using namespace cute;
+    static_assert(decltype(size<0, 0>(rowcol_layout))::value == 2);
+    static_assert(decltype(size<1, 0>(rowcol_layout))::value == 2);
+    auto l = logical_divide(rowcol_layout,
+                            Shape<Underscore, Shape<Underscore, Int<2>>>{});
+
+    return make_layout(make_layout(get<0>(get<1>(l)), get<0>(get<0>(l)),
+                                   get<0>(get<1>(get<1>(l)))),
+                       get<1>(get<0>(l)), get<1>(get<1>(get<1>(l))));
+}
+
+DEVICE auto convert_layout_C_Aregs() {
+    using namespace cute;
+    auto layout_s = Layout<Shape<Shape<_2, _2>, _2, _16>>{};
+    auto l = logical_divide(layout_s, Shape<Underscore, Underscore, _2>{});
+
+    return make_layout(
+        make_layout(get<0>(get<0>(l)), get<1>(get<0>(l)), get<0>(get<2>(l))),
+        get<1>(l), get<1>(get<2>(l)));
+}
+
+template <class LayoutType>
+DEVICE auto convert_layout_scores(LayoutType layout_s) {
+    using namespace cute;
+    static_assert(decltype(size<0>(layout_s))::value == 4);
+    static_assert(decltype(rank(layout_s))::value == 3);
+
+    auto l = logical_divide(layout_s, Shape<_2>{});
+    return make_layout(make_layout(get<1>(get<0>(l)), get<1>(l)),
+                       make_layout(get<0>(get<0>(l)), get<2>(l)));
+}
+
+template <int ATOMNUM, class LayoutType>
+DEVICE auto convert_layout_scores_copyview(LayoutType layout_s) {
+    using namespace cute;
+
+    auto l = logical_divide(layout_s, Shape<Underscore, Int<ATOMNUM>>{});
+    return make_layout(get<0>(get<1>(l)), get<0>(l), get<1>(get<1>(l)));
+}
+
+}  // namespace cutlass_wrapper
+}  // namespace benchmarks