ROCm · AviralGoelAMD · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025 · Oct 16, 2025
@@ -678,6 +678,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
         PACKAGE_NAME examples
    )
    add_subdirectory(example)
+
+   add_subdirectory(tutorial)
+   rocm_package_setup_component(tutorials
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME tutorials
+   )
    add_subdirectory(tile_engine)
    if(BUILD_TESTING)
        add_subdirectory(test)

@@ -25,6 +25,5 @@ add_subdirectory(22_gemm_multi_abd)
 add_subdirectory(35_batched_transpose)
 add_subdirectory(36_pooling)
 add_subdirectory(38_block_scale_gemm)
-add_subdirectory(39_copy)
 add_subdirectory(40_streamk_gemm)
 add_subdirectory(41_batched_contraction)
@@ -0,0 +1,15 @@
+include_directories(BEFORE
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/library/include
+)
+
+message(STATUS "Building tutorials...")
+add_custom_target(tutorials)
+
+# add all tutorial subdir
+file(GLOB dir_list LIST_DIRECTORIES true *)
+FOREACH(subdir ${dir_list})
+    if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt")
+        add_subdirectory(${subdir})
+    ENDIF()
+ENDFOREACH()
@@ -1,7 +1,9 @@
-add_executable(tile_example_copy EXCLUDE_FROM_ALL copy_basic.cpp)
+add_executable(tile_tutorial_copy_kernel EXCLUDE_FROM_ALL copy_basic.cpp)
 
 # Impact: This flag ensures that the compiler doesn't make 
 # assumptions about memory aliasing that could interfere with Composable Kernel's explicit memory access patterns.
-target_compile_options(tile_example_copy PRIVATE
+target_compile_options(tile_tutorial_copy_kernel PRIVATE
   -mllvm -enable-noalias-to-md-conversion=0
 )
+
+add_dependencies(tutorials tile_tutorial_copy_kernel)
@@ -4,7 +4,7 @@
 
 set -euo pipefail
 
-BIN="${BIN:-../../../build/bin/tile_example_copy}"
+BIN="${BIN:-../../../build/bin/tile_tutorial_copy_kernel}"
 WARMUP="${WARMUP:-20}"
 REPEAT="${REPEAT:-100}"
 VALIDATE="${VALIDATE:-1}"

@@ -0,0 +1,7 @@
+add_executable(tile_tutorial_naive_gemm EXCLUDE_FROM_ALL practice_gemm.cpp)
+
+target_compile_options(tile_tutorial_naive_gemm PRIVATE
+  -mllvm -enable-noalias-to-md-conversion=0
+)
+
+add_dependencies(tutorials tile_tutorial_naive_gemm)
@@ -0,0 +1,152 @@
+# CK Tile Practice GEMM Example
+
+This is a practice implementation of a GEMM (General Matrix Multiplication) kernel using the CK Tile API. It demonstrates the fundamental concepts of GPU kernel development using CK Tile's hierarchical tile system.
+
+## CK Tile API Structure
+
+In the composable_kernel library's ck_tile API, **A Kernel is composed of a Problem, a Policy and an Epilogue**:
+
+1. **Problem** describes the shape, data type, data layout, precision of our GEMM matrices
+2. **Policy** describes how the data in the matrix (or tile) is mapped to the threads
+3. **Epilogue** describes additional computation work performed after the gemm computations (this example does not have an epilogue)
+
+## Overview
+
+This example implements a complete GEMM kernel `C = A × B` using the CK Tile framework, showcasing:
+
+- **Problem Setup** - Setting up the problem (input/output shapes, data types, mathematical operations), composing a kernel (pipeline, policy, epilogue), kernel launch
+- **Block-level Pipelining** - creating tensor views, dispatching to block-level GEMM
+- **Block-level GEMM Computation** - Block tiles, tile window creation, loading/storing to DRAM and Register memory
+- **Warp-level GEMM Computation** - Warp tiles, MFMA level computation
+
+## Problem Setup and Data Flow
+
+### Problem Size Configuration
+We set the problem size using the M, N and K variables:
+```cpp
+ck_tile::index_t M = 1024;   // Number of rows in A and C
+ck_tile::index_t N = 512;  // Number of columns in B and C
+ck_tile::index_t K = 256;  // Number of columns in A, rows in B
+```
+
+### Host Matrix Creation
+Three host matrices A (M×K), B (N×K) and C (M×N) are created, initialized on the CPU and copied over to the GPU global/DRAM memory:
+```cpp
+// Host tensors with proper strides
+ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides);  // M × K
+ck_tile::HostTensor<BDataType> b_host(b_lengths, b_strides);  // N × K
+ck_tile::HostTensor<CDataType> c_host(c_lengths, c_strides);  // M × N
+
+// Initialize with random data
+ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);
+ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_host);
+
+// Allocate device memory and transfer data
+ck_tile::DeviceMem a_device(a_host);
+a_device.ToDevice(a_host.data());
+```
+
+### PracticeGemmShape Configuration
+A PracticeGemmShape struct holds the dimension of each BlockTile and WaveTile:
+![Tile Hierarchy](./images/tile_hierarchy.png)
+
+```cpp
+using BlockTile = ck_tile::sequence<256, 128, 32>;  // M, N, K per block
+using WaveTile  = ck_tile::sequence<16, 16, 16>;   // M, N, K per wave
+```
+- A BlockTile of size MxK (256x32) on A matrix and NxK (128x32) on B matrix. A WaveTile of size MxN (16x16) on C matrix.
+
+![Tile Coverage](./images/gemm_loop.png)
+
+- BlockTiles iterate in K dimension to fetch data required for computing region of C covered by C's block tile.
+- BlockTiles are further subdivided into WarpTiles.
+- WarpTiles over A and B similarly work together to calculate the WarpTile of C.
+
+### Problem and Policy Composition
+```cpp
+// A Problem is composed from Shape and info about the data
+using PracticeGemmHostProblem = ck_tile::
+    PracticeGemmHostProblem<ADataType, BDataType, CDataType, AccDataType, PracticeGemmShape>;
+
+// A Policy is created describing data-to-thread mapping
+using PracticeGemmHostPolicy = ck_tile::PracticeGemmHostPolicy;
+
+// A Kernel is then composed of Problem and Policy
+using gemm_kernel = ck_tile::PracticeGemmKernel<PracticeGemmHostProblem, PracticeGemmHostPolicy>;
+```
+
+### Kernel Launch
+`ck_tile::launch_kernel()` is used to launch the kernel on device. It calls the `operator()` function of `PracticeGemmKernel{}`:
+```cpp
+float ave_time = ck_tile::launch_kernel(
+    ck_tile::stream_config{nullptr, true, 0, 0, 1},
+    ck_tile::make_kernel<kBlockSize, kBlockPerCU>(
+        gemm_kernel{},  // Kernel composed of Problem + Policy
+        kGridSize,      // Grid dimensions
+        kBlockSize,     // Block dimensions
+        0,              // Dynamic shared memory
+        // Kernel arguments: device buffers and problem dimensions
+        a_device.GetDeviceBuffer(), b_device.GetDeviceBuffer(), c_device.GetDeviceBuffer(),
+        M, N, K, stride_a, stride_b, stride_c));
+```
+
+### Result Verification
+The results from the kernel are compared with results from CPU based computation function:
+```cpp
+// CPU reference implementation
+ck_tile::HostTensor<CDataType> c_host_ref(c_lengths, c_strides);
+reference_basic_gemm<ADataType, BDataType, AccDataType, CDataType>(a_host, b_host, c_host_ref);
+
+// Device results
+ck_tile::HostTensor<CDataType> c_host_dev(c_lengths, c_strides);
+
+// Verify correctness
+bool pass = ck_tile::check_err(c_host_dev, c_host_ref);
+```
+
+### Runtime Flow
+
+The main program (`practice_gemm.cpp`) is the entry point for the runtime flow:
+
+```cpp
+int main()
+{
+    // 1. Define data types and problem sizes
+    using ADataType = ck_tile::half_t;
+    ck_tile::index_t M = 2048, N = 1024, K = 512;
+
+    // 2. Create host tensors and initialize
+    ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides);
+    ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);
+
+    // 3. Allocate device memory and transfer data
+    ck_tile::DeviceMem a_device(a_host);
+
+    // 4. Configure tile shapes
+    using BlockTile = ck_tile::sequence<256, 128, 32>;
+    using WaveTile  = ck_tile::sequence<16, 16, 16>;
+
+    // 5. Launch kernel
+    using gemm_kernel = ck_tile::PracticeGemmKernel<Problem, Policy>;
+    float ave_time = ck_tile::launch_kernel(/*...*/);
+
+    // 6. Verify results
+    bool pass = verify_results(a_host, b_host, c_host);
+
+    // 7. Print performance metrics
+    print_performance_metrics(ave_time, M, N, K);
+}
+```
+
+## Building and Running
+
+```bash
+# From composable_kernel root directory
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh ../ <arch>
+make tile_example_practice_gemm -j
+
+# Run with sample sizes
+./bin/tile_example_practice_gemm
+```
+This example serves as a foundation for understanding more complex GEMM implementations and optimization strategies in the CK Tile framework.
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+namespace ck_tile {
+
+template <typename Problem, typename Policy = PracticeGemmBlockPolicy>
+struct PracticeGemmBlockPipelineAGmemBGmemCreg
+{
+    using ADataType   = typename Problem::ADataType;
+    using BDataType   = typename Problem::BDataType;
+    using CDataType   = typename Problem::CDataType;
+    using AccDataType = typename Problem::AccDataType;
+
+    using BlockTile = typename Problem::Shape::BlockTile;
+    using WaveTile  = typename Problem::Shape::WaveTile;
+
+    static constexpr index_t MPerBlock = BlockTile::at(number<0>{});
+    static constexpr index_t NPerBlock = BlockTile::at(number<1>{});
+    static constexpr index_t KPerBlock = BlockTile::at(number<2>{});
+
+    static constexpr index_t MPerWave = WaveTile::at(number<0>{});
+    static constexpr index_t NPerWave = WaveTile::at(number<1>{});
+    static constexpr index_t KPerWave = WaveTile::at(number<2>{});
+
+    using BlockGemm =
+        remove_cvref_t<decltype(Policy::template GetPracticeWaveGemmPipeline<Problem>())>;
+
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLDSSize()
+    {
+        return integer_divide_ceil(
+                   sizeof(ADataType) *
+                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
+                   16) *
+                   16 +
+               sizeof(BDataType) *
+                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+    }
+
+    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
+    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                        index_t num_loop,
+                                        void* p_smem) const
+    {
+        static_assert(
+            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+            "wrong!");
+
+        static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                      "wrong!");
+
+        // -----------------------------------------------------------------------------------------
+        // Definitions of all needed tiles
+
+        // A tile in LDS
+        ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
+
+        constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+
+        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+        constexpr index_t a_lds_block_space_size_aligned =
+            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            16;
+
+        // B tile in LDS
+        BDataType* p_b_lds = static_cast<BDataType*>(
+            static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
+
+        constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+
+        auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+
+        // A DRAM tile window for load
+        auto a_copy_dram_window =
+            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                             a_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeADramTileDistribution<Problem>());
+
+        // A LDS tile window for store
+        auto a_copy_lds_window =
+            make_tile_window(a_lds_block,
+                             make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             a_copy_dram_window.get_tile_distribution());
+
+        // B DRAM tile window for load
+        auto b_copy_dram_window =
+            make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                             b_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeBDramTileDistribution<Problem>());
+
+        // B LDS tile window for store
+        auto b_copy_lds_window =
+            make_tile_window(b_lds_block,
+                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             b_copy_dram_window.get_tile_distribution());
+
+        // A LDS tile for block GEMM
+        auto a_lds_gemm_window = make_tile_window(
+            a_lds_block, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+        // B LDS tile for block GEMM
+        auto b_lds_gemm_window = make_tile_window(
+            b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+        // Block GEMM
+        auto block_gemm = BlockGemm();
+
+        // Acc register tile
+        auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};
+
+        using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
+        using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+
+        using ABlockTile = decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
+        using BBlockTile = decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
+
+        ABlockTile a_block_tile;
+        BBlockTile b_block_tile;
+        using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex;
+        using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex;
+        constexpr ADramTileWindowStep a_dram_tile_window_step = make_array(0, KPerBlock);
+        constexpr BDramTileWindowStep b_dram_tile_window_step = make_array(0, KPerBlock);
+
+        // -------------------------------------------------------------------------------------
+        // Gemm pipeline start
+
+        // Initialize C
+        tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+        // non-prefetch
+        index_t iCounter = num_loop;
+
+        while(iCounter > 0)
+        {
+            a_block_tile = load_tile(a_copy_dram_window); // from DRAM to registers
+            b_block_tile = load_tile(b_copy_dram_window); // from DRAM to registers
+            move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+            move_tile_window(b_copy_dram_window, b_dram_tile_window_step);
+            store_tile(a_copy_lds_window, a_block_tile); // from registers to LDS
+            store_tile(b_copy_lds_window, b_block_tile); // from registers to LDS
+
+            block_sync_lds();
+            block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // from LDS to registers
+            block_sync_lds();
+
+            iCounter--;
+        }
+
+        return c_block_tile;
+    }
+};
+
+} // namespace ck_tile