Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
PACKAGE_NAME examples
)
add_subdirectory(example)

add_subdirectory(tutorial)
rocm_package_setup_component(tutorials
LIBRARY_NAME composablekernel
PACKAGE_NAME tutorials
)
add_subdirectory(tile_engine)
if(BUILD_TESTING)
add_subdirectory(test)
Expand Down
1 change: 0 additions & 1 deletion example/ck_tile/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,5 @@ add_subdirectory(22_gemm_multi_abd)
add_subdirectory(35_batched_transpose)
add_subdirectory(36_pooling)
add_subdirectory(38_block_scale_gemm)
add_subdirectory(39_copy)
add_subdirectory(40_streamk_gemm)
add_subdirectory(41_batched_contraction)
15 changes: 15 additions & 0 deletions tutorial/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
)

message(STATUS "Building tutorials...")
add_custom_target(tutorials)

# add all tutorial subdir
file(GLOB dir_list LIST_DIRECTORIES true *)
FOREACH(subdir ${dir_list})
if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt")
add_subdirectory(${subdir})
ENDIF()
ENDFOREACH()
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
add_executable(tile_example_copy EXCLUDE_FROM_ALL copy_basic.cpp)
add_executable(tile_tutorial_copy_kernel EXCLUDE_FROM_ALL copy_basic.cpp)

# Impact: This flag ensures that the compiler doesn't make
# assumptions about memory aliasing that could interfere with Composable Kernel's explicit memory access patterns.
target_compile_options(tile_example_copy PRIVATE
target_compile_options(tile_tutorial_copy_kernel PRIVATE
-mllvm -enable-noalias-to-md-conversion=0
)

add_dependencies(tutorials tile_tutorial_copy_kernel)
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

set -euo pipefail

BIN="${BIN:-../../../build/bin/tile_example_copy}"
BIN="${BIN:-../../../build/bin/tile_tutorial_copy_kernel}"
WARMUP="${WARMUP:-20}"
REPEAT="${REPEAT:-100}"
VALIDATE="${VALIDATE:-1}"
Expand Down
7 changes: 7 additions & 0 deletions tutorial/ck_tile/01_naive_gemm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
add_executable(tile_tutorial_naive_gemm EXCLUDE_FROM_ALL practice_gemm.cpp)

target_compile_options(tile_tutorial_naive_gemm PRIVATE
-mllvm -enable-noalias-to-md-conversion=0
)

add_dependencies(tutorials tile_tutorial_naive_gemm)
152 changes: 152 additions & 0 deletions tutorial/ck_tile/01_naive_gemm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# CK Tile Practice GEMM Example

This is a practice implementation of a GEMM (General Matrix Multiplication) kernel using the CK Tile API. It demonstrates the fundamental concepts of GPU kernel development using CK Tile's hierarchical tile system.

## CK Tile API Structure

In the composable_kernel library's ck_tile API, **A Kernel is composed of a Problem, a Policy and an Epilogue**:

1. **Problem** describes the shape, data type, data layout, precision of our GEMM matrices
2. **Policy** describes how the data in the matrix (or tile) is mapped to the threads
3. **Epilogue** describes additional computation work performed after the gemm computations (this example does not have an epilogue)

## Overview

This example implements a complete GEMM kernel `C = A × B` using the CK Tile framework, showcasing:

- **Problem Setup** - Setting up the problem (input/output shapes, data types, mathematical operations), composing a kernel (pipeline, policy, epilogue), kernel launch
- **Block-level Pipelining** - creating tensor views, dispatching to block-level GEMM
- **Block-level GEMM Computation** - Block tiles, tile window creation, loading/storing to DRAM and Register memory
- **Warp-level GEMM Computation** - Warp tiles, MFMA level computation

## Problem Setup and Data Flow

### Problem Size Configuration
We set the problem size using the M, N and K variables:
```cpp
ck_tile::index_t M = 1024; // Number of rows in A and C
ck_tile::index_t N = 512; // Number of columns in B and C
ck_tile::index_t K = 256; // Number of columns in A, rows in B
```

### Host Matrix Creation
Three host matrices A (M×K), B (N×K) and C (M×N) are created, initialized on the CPU and copied over to the GPU global/DRAM memory:
```cpp
// Host tensors with proper strides
ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides); // M × K
ck_tile::HostTensor<BDataType> b_host(b_lengths, b_strides); // N × K
ck_tile::HostTensor<CDataType> c_host(c_lengths, c_strides); // M × N

// Initialize with random data
ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);
ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_host);

// Allocate device memory and transfer data
ck_tile::DeviceMem a_device(a_host);
a_device.ToDevice(a_host.data());
```

### PracticeGemmShape Configuration
A PracticeGemmShape struct holds the dimension of each BlockTile and WaveTile:
![Tile Hierarchy](./images/tile_hierarchy.png)

```cpp
using BlockTile = ck_tile::sequence<256, 128, 32>; // M, N, K per block
using WaveTile = ck_tile::sequence<16, 16, 16>; // M, N, K per wave
```
- A BlockTile of size MxK (256x32) on A matrix and NxK (128x32) on B matrix. A WaveTile of size MxN (16x16) on C matrix.

![Tile Coverage](./images/gemm_loop.png)

- BlockTiles iterate in K dimension to fetch data required for computing region of C covered by C's block tile.
- BlockTiles are further subdivided into WarpTiles.
- WarpTiles over A and B similarly work together to calculate the WarpTile of C.

### Problem and Policy Composition
```cpp
// A Problem is composed from Shape and info about the data
using PracticeGemmHostProblem = ck_tile::
PracticeGemmHostProblem<ADataType, BDataType, CDataType, AccDataType, PracticeGemmShape>;

// A Policy is created describing data-to-thread mapping
using PracticeGemmHostPolicy = ck_tile::PracticeGemmHostPolicy;

// A Kernel is then composed of Problem and Policy
using gemm_kernel = ck_tile::PracticeGemmKernel<PracticeGemmHostProblem, PracticeGemmHostPolicy>;
```

### Kernel Launch
`ck_tile::launch_kernel()` is used to launch the kernel on device. It calls the `operator()` function of `PracticeGemmKernel{}`:
```cpp
float ave_time = ck_tile::launch_kernel(
ck_tile::stream_config{nullptr, true, 0, 0, 1},
ck_tile::make_kernel<kBlockSize, kBlockPerCU>(
gemm_kernel{}, // Kernel composed of Problem + Policy
kGridSize, // Grid dimensions
kBlockSize, // Block dimensions
0, // Dynamic shared memory
// Kernel arguments: device buffers and problem dimensions
a_device.GetDeviceBuffer(), b_device.GetDeviceBuffer(), c_device.GetDeviceBuffer(),
M, N, K, stride_a, stride_b, stride_c));
```

### Result Verification
The results from the kernel are compared with results from CPU based computation function:
```cpp
// CPU reference implementation
ck_tile::HostTensor<CDataType> c_host_ref(c_lengths, c_strides);
reference_basic_gemm<ADataType, BDataType, AccDataType, CDataType>(a_host, b_host, c_host_ref);

// Device results
ck_tile::HostTensor<CDataType> c_host_dev(c_lengths, c_strides);

// Verify correctness
bool pass = ck_tile::check_err(c_host_dev, c_host_ref);
```

### Runtime Flow

The main program (`practice_gemm.cpp`) is the entry point for the runtime flow:

```cpp
int main()
{
// 1. Define data types and problem sizes
using ADataType = ck_tile::half_t;
ck_tile::index_t M = 2048, N = 1024, K = 512;

// 2. Create host tensors and initialize
ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides);
ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);

// 3. Allocate device memory and transfer data
ck_tile::DeviceMem a_device(a_host);

// 4. Configure tile shapes
using BlockTile = ck_tile::sequence<256, 128, 32>;
using WaveTile = ck_tile::sequence<16, 16, 16>;

// 5. Launch kernel
using gemm_kernel = ck_tile::PracticeGemmKernel<Problem, Policy>;
float ave_time = ck_tile::launch_kernel(/*...*/);

// 6. Verify results
bool pass = verify_results(a_host, b_host, c_host);

// 7. Print performance metrics
print_performance_metrics(ave_time, M, N, K);
}
```

## Building and Running

```bash
# From composable_kernel root directory
mkdir build && cd build
sh ../script/cmake-ck-dev.sh ../ <arch>
make tile_example_practice_gemm -j

# Run with sample sizes
./bin/tile_example_practice_gemm
```
This example serves as a foundation for understanding more complex GEMM implementations and optimization strategies in the CK Tile framework.
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck_tile/core.hpp"
#include "ck_tile/host.hpp"

namespace ck_tile {

template <typename Problem, typename Policy = PracticeGemmBlockPolicy>
struct PracticeGemmBlockPipelineAGmemBGmemCreg
{
using ADataType = typename Problem::ADataType;
using BDataType = typename Problem::BDataType;
using CDataType = typename Problem::CDataType;
using AccDataType = typename Problem::AccDataType;

using BlockTile = typename Problem::Shape::BlockTile;
using WaveTile = typename Problem::Shape::WaveTile;

static constexpr index_t MPerBlock = BlockTile::at(number<0>{});
static constexpr index_t NPerBlock = BlockTile::at(number<1>{});
static constexpr index_t KPerBlock = BlockTile::at(number<2>{});

static constexpr index_t MPerWave = WaveTile::at(number<0>{});
static constexpr index_t NPerWave = WaveTile::at(number<1>{});
static constexpr index_t KPerWave = WaveTile::at(number<2>{});

using BlockGemm =
remove_cvref_t<decltype(Policy::template GetPracticeWaveGemmPipeline<Problem>())>;

CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLDSSize()
{
return integer_divide_ceil(
sizeof(ADataType) *
Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
16) *
16 +
sizeof(BDataType) *
Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
}

template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
const BDramBlockWindowTmp& b_dram_block_window_tmp,
index_t num_loop,
void* p_smem) const
{
static_assert(
std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
"wrong!");

static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
"wrong!");

// -----------------------------------------------------------------------------------------
// Definitions of all needed tiles

// A tile in LDS
ADataType* p_a_lds = static_cast<ADataType*>(p_smem);

constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();

auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);

constexpr index_t a_lds_block_space_size_aligned =
integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
16;

// B tile in LDS
BDataType* p_b_lds = static_cast<BDataType*>(
static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));

constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();

auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);

// A DRAM tile window for load
auto a_copy_dram_window =
make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
a_dram_block_window_tmp.get_window_origin(),
Policy::template MakeADramTileDistribution<Problem>());

// A LDS tile window for store
auto a_copy_lds_window =
make_tile_window(a_lds_block,
make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
{0, 0},
a_copy_dram_window.get_tile_distribution());

// B DRAM tile window for load
auto b_copy_dram_window =
make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
b_dram_block_window_tmp.get_window_origin(),
Policy::template MakeBDramTileDistribution<Problem>());

// B LDS tile window for store
auto b_copy_lds_window =
make_tile_window(b_lds_block,
make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
{0, 0},
b_copy_dram_window.get_tile_distribution());

// A LDS tile for block GEMM
auto a_lds_gemm_window = make_tile_window(
a_lds_block, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});

// B LDS tile for block GEMM
auto b_lds_gemm_window = make_tile_window(
b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});

// Block GEMM
auto block_gemm = BlockGemm();

// Acc register tile
auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};

using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());

using ABlockTile = decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
using BBlockTile = decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));

ABlockTile a_block_tile;
BBlockTile b_block_tile;
using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex;
using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex;
constexpr ADramTileWindowStep a_dram_tile_window_step = make_array(0, KPerBlock);
constexpr BDramTileWindowStep b_dram_tile_window_step = make_array(0, KPerBlock);

// -------------------------------------------------------------------------------------
// Gemm pipeline start

// Initialize C
tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
// non-prefetch
index_t iCounter = num_loop;

while(iCounter > 0)
{
a_block_tile = load_tile(a_copy_dram_window); // from DRAM to registers
b_block_tile = load_tile(b_copy_dram_window); // from DRAM to registers
move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
store_tile(a_copy_lds_window, a_block_tile); // from registers to LDS
store_tile(b_copy_lds_window, b_block_tile); // from registers to LDS

block_sync_lds();
block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // from LDS to registers
block_sync_lds();

iCounter--;
}

return c_block_tile;
}
};

} // namespace ck_tile
Loading
Loading