diff --git a/example/ck_tile/15_fused_moe/main.cpp b/example/ck_tile/15_fused_moe/main.cpp index ac174379df..efb83efbd2 100644 --- a/example/ck_tile/15_fused_moe/main.cpp +++ b/example/ck_tile/15_fused_moe/main.cpp @@ -284,26 +284,25 @@ bool run(const ck_tile::ArgParser& arg_parser) } else if(init == 1) { - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(a_host); - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(g_host); - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(d_host); - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sa_host); - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sg_host); - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sd_host); - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}(sy_host); - ck_tile::FillUniformDistribution{-.5f, .5f, seed, true}( - topk_weight_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(a_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(g_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(d_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(sa_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(sg_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(sd_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(sy_host); + ck_tile::FillUniformDistribution{-.5f, .5f, seed}(topk_weight_host); } else if(init == 2) { - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(a_host); - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(g_host); - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(d_host); - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sa_host); - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sg_host); - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sd_host); - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(sy_host); - ck_tile::FillNormalDistribution{0.f, 1.f, seed, true}(topk_weight_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(a_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(g_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(d_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(sa_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(sg_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(sd_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(sy_host); + ck_tile::FillNormalDistribution{0.f, 1.f, seed}(topk_weight_host); } // permute weight diff --git a/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc b/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc index 44fd12e2d9..cc2c041ed6 100644 --- a/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc +++ b/example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc @@ -71,17 +71,17 @@ int run_mx_flatmm_with_layouts(int argc, if(init_method == 0) { - ck_tile::FillUniformDistribution{0.0f, 1.0f}(a_host); - ck_tile::FillUniformDistribution{-.5f, .5f}(b_origin_host); - ck_tile::FillUniformDistribution{-2.f, 2.f}(scale_a); - ck_tile::FillUniformDistribution{-2.f, 2.f}(scale_b); + ck_tile::FillUniformDistribution<>{0.0f, 1.0f}(a_host); + ck_tile::FillUniformDistribution<>{-.5f, .5f}(b_origin_host); + ck_tile::FillUniformDistribution<>{-2.f, 2.f}(scale_a); + ck_tile::FillUniformDistribution<>{-2.f, 2.f}(scale_b); } else if(init_method == 1) { - ck_tile::FillUniformDistribution{1.f, 1.f}(a_host); - ck_tile::FillUniformDistribution{1.f, 1.f}(b_origin_host); - ck_tile::FillUniformDistribution{1.f, 1.f}(scale_a); - ck_tile::FillUniformDistribution{1.f, 1.f}(scale_b); + ck_tile::FillUniformDistribution<>{1.f, 1.f}(a_host); + ck_tile::FillUniformDistribution<>{1.f, 1.f}(b_origin_host); + ck_tile::FillUniformDistribution<>{1.f, 1.f}(scale_a); + ck_tile::FillUniformDistribution<>{1.f, 1.f}(scale_b); } else { diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp index 12f43ebc5e..4bbf8cbf3f 100644 --- a/include/ck_tile/host/fill.hpp +++ b/include/ck_tile/host/fill.hpp @@ -33,59 +33,73 @@ namespace ck_tile { * @example * * // Direct usage without creating a separate variable: - * ck_tile::FillUniformDistribution{-1.f, 1.f}(a_host_tensor); + * ck_tile::FillUniformDistribution<>{-1.f, 1.f}(a_host_tensor); */ -template +template struct FillUniformDistribution { float a_{-5.f}; float b_{5.f}; std::optional seed_{11939}; - // ATTENTION: Whether to use multi-threading (note: not guaranteed to be perfectly distributed - // across threads). - bool threaded = false; template void operator()(ForwardIter first, ForwardIter last) const { - if(threaded) + if(first == last) + return; + using T_iter = std::decay_t; + static_assert(std::is_same_v || std::is_void_v, + "Iterator value type must match template type T"); + constexpr auto PackedSize = numeric_traits::PackedSize; + const auto total = static_cast(std::distance(first, last)); + const auto total_bytes = total * sizeof(T_iter); + + // max 80 threads; at least 2MB per thread + const size_t available_cpu_cores = get_available_cpu_cores(); + const size_t num_thread = + min(80UL, available_cpu_cores, integer_divide_ceil(total_bytes, 0x200000UL)); + constexpr size_t BLOCK_BYTES = 64; + constexpr size_t BLOCK_SIZE = BLOCK_BYTES / sizeof(T_iter); + const size_t num_blocks = integer_divide_ceil(total_bytes, BLOCK_BYTES); + const size_t blocks_per_thread = integer_divide_ceil(num_blocks, num_thread); + + // use minstd_rand for better performance on discard() + std::minstd_rand gen(seed_.has_value() ? *seed_ : std::random_device{}()); + std::uniform_real_distribution dis(a_, b_); + + std::vector threads; + threads.reserve(num_thread - 1); // last job run in the main thread + for(int it = num_thread - 1; it >= 0; --it) { - uint32_t num_thread = std::thread::hardware_concurrency(); - auto total = static_cast(std::distance(first, last)); - auto work_per_thread = static_cast((total + num_thread - 1) / num_thread); + const size_t ib_begin = it * blocks_per_thread; + const size_t ib_end = min(ib_begin + blocks_per_thread, num_blocks); + + auto job = [=]() { + auto g_ = gen; // copy + auto d_ = dis; // copy + g_.discard(ib_begin * BLOCK_SIZE * PackedSize); + auto t_fn = [&]() { + if constexpr(PackedSize == 2) + return type_convert(fp32x2_t{d_(g_), d_(g_)}); + else + return type_convert(d_(g_)); + }; - std::vector threads(num_thread); - for(std::size_t it = 0; it < num_thread; ++it) - { - std::size_t iw_begin = it * work_per_thread; - std::size_t iw_end = std::min((it + 1) * work_per_thread, total); - auto thread_f = [this, total, iw_begin, iw_end, &first] { - if(iw_begin > total || iw_end > total) - return; - // need to make each thread unique, add an offset to current seed - std::mt19937 gen(seed_.has_value() ? (*seed_ + iw_begin) - : std::random_device{}()); - std::uniform_real_distribution dis(a_, b_); - std::generate(first + iw_begin, first + iw_end, [&dis, &gen]() { - if constexpr(numeric_traits::PackedSize == 2) - return ck_tile::type_convert(fp32x2_t{dis(gen), dis(gen)}); - else - return ck_tile::type_convert(dis(gen)); + size_t ib = ib_begin; + for(; ib < ib_end - 1; ++ib) // full blocks + static_for<0, BLOCK_SIZE, 1>{}([&](auto iw_) { + constexpr size_t iw = iw_.value; + *(first + ib * BLOCK_SIZE + iw) = t_fn(); }); - }; - threads[it] = joinable_thread(thread_f); - } - } - else - { - std::mt19937 gen(seed_.has_value() ? *seed_ : std::random_device{}()); - std::uniform_real_distribution dis(a_, b_); - std::generate(first, last, [&dis, &gen]() { - if constexpr(numeric_traits::PackedSize == 2) - return ck_tile::type_convert(fp32x2_t{dis(gen), dis(gen)}); - else - return ck_tile::type_convert(dis(gen)); - }); + for(size_t iw = 0; iw < BLOCK_SIZE; ++iw) // last block + if(ib * BLOCK_SIZE + iw < total) + *(first + ib * BLOCK_SIZE + iw) = t_fn(); + }; + + if(it > 0) + threads.emplace_back(std::move(job)); + else + job(); // last job run in the main thread } } diff --git a/include/ck_tile/host/joinable_thread.hpp b/include/ck_tile/host/joinable_thread.hpp index bf84858ee2..b2e1fc4dac 100644 --- a/include/ck_tile/host/joinable_thread.hpp +++ b/include/ck_tile/host/joinable_thread.hpp @@ -3,6 +3,9 @@ #pragma once +#ifdef __linux__ +#include +#endif #include #include @@ -24,4 +27,50 @@ struct joinable_thread : std::thread this->join(); } }; + +inline unsigned int get_available_cpu_cores() +{ +#if defined(__linux__) + cpu_set_t cpu_set; + if(sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0) + { + unsigned int cpu_count = CPU_COUNT(&cpu_set); + if(cpu_count > 0) + return cpu_count; + } +#endif + // Fallback if sched_getaffinity unavailable or fails + return std::thread::hardware_concurrency(); +} + +class cpu_core_guard +{ +#if defined(__linux__) + cpu_set_t original_cpu_set_; + + public: + cpu_core_guard(unsigned int num_cores) : original_cpu_set_() + { + // save original cpu set + sched_getaffinity(0, sizeof(cpu_set_t), &original_cpu_set_); + + // set new cpu set + cpu_set_t new_cpu_set; + CPU_ZERO(&new_cpu_set); + for(unsigned int i = 0; i < num_cores; ++i) + { +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + CPU_SET(i, &new_cpu_set); // NOLINT(old-style-cast) +#pragma clang diagnostic pop + } + sched_setaffinity(0, sizeof(cpu_set_t), &new_cpu_set); + } + ~cpu_core_guard() + { + // restore original cpu set + sched_setaffinity(0, sizeof(cpu_set_t), &original_cpu_set_); + } +#endif +}; } // namespace ck_tile diff --git a/test/ck_tile/utility/CMakeLists.txt b/test/ck_tile/utility/CMakeLists.txt index aa15293411..01ed83841b 100644 --- a/test/ck_tile/utility/CMakeLists.txt +++ b/test/ck_tile/utility/CMakeLists.txt @@ -3,5 +3,7 @@ message("-- Adding: test/ck_tile/utility/") +add_gtest_executable(test_fill test_fill.cpp) + # Add print tests add_subdirectory(print) diff --git a/test/ck_tile/utility/test_fill.cpp b/test/ck_tile/utility/test_fill.cpp new file mode 100644 index 0000000000..18f42c4ad0 --- /dev/null +++ b/test/ck_tile/utility/test_fill.cpp @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck_tile/host/fill.hpp" +#include "ck_tile/host/joinable_thread.hpp" +#include +#include +#include +#include + +using namespace ck_tile; + +namespace test { + +// Test fixture for FillUniformDistribution tests +template +class FillUniformDistributionTest : public ::testing::Test +{ + public: + static constexpr uint32_t seed = 42; + static constexpr float a = -5.0f; + static constexpr float b = 5.0f; +}; + +using TestTypes = ::testing::Types; +TYPED_TEST_SUITE(FillUniformDistributionTest, TestTypes); + +// Test that multiple runs with the same seed produce identical results +TYPED_TEST(FillUniformDistributionTest, ConsistencyWithSameSeed) +{ + using T = TypeParam; + const auto a = this->a; + const auto b = this->b; + const auto seed = this->seed; + + constexpr size_t size = 1024 * 1024 * 1024 / sizeof(T); // 1G + + std::vector vec1(size); + auto start = std::chrono::high_resolution_clock::now(); + FillUniformDistribution{a, b, seed}(vec1.begin(), vec1.end()); + auto end = std::chrono::high_resolution_clock::now(); + double sec = std::chrono::duration(end - start).count(); + std::cout << "Taking " << sec << " sec to fill 1GB of data of type " << typeid(T).name() + << std::endl; + + const auto cpu_cores = max(32U, get_available_cpu_cores()); + for(auto num_threads_diff : {-3, -1}) + { + cpu_core_guard cg(min(max(cpu_cores + num_threads_diff, 1U), get_available_cpu_cores())); + std::vector vec2(size); + FillUniformDistribution{a, b, seed}(vec2.begin(), vec2.end()); + EXPECT_EQ(0, std::memcmp(vec1.data(), vec2.data(), size * sizeof(T))) + << "First and second fill should be identical"; + } +} + +// Test consistency across different data sizes (which affects threading) +TYPED_TEST(FillUniformDistributionTest, ConsistencyAcrossSizes) +{ + using T = TypeParam; + const auto a = this->a; + const auto b = this->b; + const auto seed = this->seed; + + std::vector test_sizes = { + 100, // Small - likely single threaded + 10000, // Medium + 1000000, // Large - will use multiple threads + 5000000 // Very large - will use many threads + }; + + for(size_t size : test_sizes) + { + std::vector reference(size); + std::vector test_vec(size); + + FillUniformDistribution{a, b, seed}(reference.begin(), reference.end()); + + // Run multiple times to ensure consistency + for(int run = 0; run < 3; ++run) + { + std::fill(test_vec.begin(), test_vec.end(), T{}); + FillUniformDistribution{a, b, seed}(test_vec.begin(), test_vec.end()); + + EXPECT_EQ(0, std::memcmp(reference.data(), test_vec.data(), size * sizeof(T))) + << "Mismatch for size=" << size << " run=" << run; + } + } +} + +// Test that different seeds produce different results +TYPED_TEST(FillUniformDistributionTest, CommonPrefix) +{ + using T = TypeParam; + const auto a = this->a; + const auto b = this->b; + const auto seed = this->seed; + + std::vector test_sizes = { + 100, // Small - likely single threaded + 10000, // Medium + 1000000, // Large - will use multiple threads + 5000000 // Very large - will use many threads + }; + + auto longest = std::make_unique>(test_sizes[0]); + FillUniformDistribution{a, b, seed}(longest->begin(), longest->end()); + for(size_t i = 1; i < test_sizes.size(); ++i) + { + auto current = std::make_unique>(test_sizes[i]); + FillUniformDistribution{a, b, seed}(current->begin(), current->end()); + size_t min_size = std::min(longest->size(), current->size()); + EXPECT_EQ(0, std::memcmp(longest->data(), current->data(), min_size * sizeof(T))) + << "Different sizes with same seed should have the same prefix"; + if(current->size() > longest->size()) + { + longest = std::move(current); + } + } +} + +// Test edge cases +TYPED_TEST(FillUniformDistributionTest, EdgeCases) +{ + using T = TypeParam; + const auto a = this->a; + const auto b = this->b; + const auto seed = this->seed; + + // Empty range + std::vector empty_vec; + EXPECT_NO_THROW((FillUniformDistribution{a, b, seed}(empty_vec.begin(), empty_vec.end()))); + + // Single element + std::vector single1(1); + std::vector single2(1); + FillUniformDistribution{a, b, seed}(single1.begin(), single1.end()); + FillUniformDistribution{a, b, seed}(single2.begin(), single2.end()); + + EXPECT_EQ(0, std::memcmp(single1.data(), single2.data(), sizeof(T))) + << "Single element should be consistent"; + + // Small sizes that might affect threading decisions + std::vector small_sizes = {2, 3, 7, 15, 16, 17, 31, 32, 33, 63, 64, 65}; + for(size_t size : small_sizes) + { + std::vector vec1(size); + std::vector vec2(size); + FillUniformDistribution{a, b, seed}(vec1.begin(), vec1.end()); + FillUniformDistribution{a, b, seed}(vec2.begin(), vec2.end()); + + EXPECT_EQ(0, std::memcmp(vec1.data(), vec2.data(), size * sizeof(T))) + << "Edge case failed for size=" << size; + } +} +} // namespace test