From b27ea94ad2cffac2a1660dedf5cf3b08bc1ae41f Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 5 Aug 2024 08:19:47 +0000 Subject: [PATCH 01/69] initial push --- requirements.txt | 2 +- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 112 +--- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 500 ++++++++++++++++++ 3 files changed, 505 insertions(+), 109 deletions(-) create mode 100644 src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp diff --git a/requirements.txt b/requirements.txt index ee6d1c718c..5e19e14afc 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@d39c3f5d5e95f3e2fd85b730b9f25fe91fdb7c85 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@3e9711f0cb1c7ffd3826a93dfa6dd65e98715636 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 4b3a2f8ed2..183b7045c1 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -36,6 +36,8 @@ #include #include #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/helper.hpp" #endif #include MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) @@ -341,46 +343,6 @@ struct CKArgs } // namespace -template -void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init(const ProblemDescription& problem) -{ - switch(problem.GetAlphaBetaCase()) - { - case BILINEAR: - valid_kernels = - FillValidKernelsIDs, CKArgs>(problem); - break; - case SCALE: - valid_kernels = - FillValidKernelsIDs, CKArgs>(problem); - break; - default: - valid_kernels = - FillValidKernelsIDs, CKArgs>(problem); - break; - } - index = 0; - kernel_id = valid_kernels[index]; -} - -template -bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::CheckIsSupportCKArgs( - const ProblemDescription& problem) const -{ - switch(problem.GetAlphaBetaCase()) - { - case BILINEAR: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); - case SCALE: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); - default: - return IsCKArgsSupported, CKArgs>(problem, - kernel_id); - } -} - template bool ConvHipImplicitGemm3DGroupFwdXdlops::CheckCKApplicability( const ProblemDescription& problem) const @@ -395,72 +357,6 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::CheckCKApplicability( } #endif -void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( - [[maybe_unused]] const ProblemDescription& problem) -{ - index = 0; - kernel_id = ""; - -#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - switch(problem.GetInDataType()) - { - case miopenHalf: Init(problem); break; - case miopenFloat: Init(problem); break; - case miopenInt8: Init(problem); break; - case miopenBFloat16: Init(problem); break; - case miopenInt64: - case miopenInt32: - case miopenFloat8: - case miopenBFloat8: - case miopenDouble: break; - } -#endif -} - -bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::SetNextValue( - const ProblemDescription& problem) -{ - if(valid_kernels.empty()) - { - HeuristicInit(problem); - assert(!valid_kernels.empty()); - return true; - } - if((index + 1) < valid_kernels.size()) - { - ++index; - kernel_id = valid_kernels[index]; - return true; - } - else - return false; -} - -bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValidValue() const -{ - return index < valid_kernels.size(); -} - -bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid( - [[maybe_unused]] const ProblemDescription& problem) const -{ -#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - switch(problem.GetInDataType()) - { - case miopenHalf: return CheckIsSupportCKArgs(problem); - case miopenFloat: return CheckIsSupportCKArgs(problem); - case miopenInt8: return CheckIsSupportCKArgs(problem); - case miopenBFloat16: return CheckIsSupportCKArgs(problem); - case miopenInt64: - case miopenInt32: - case miopenFloat8: - case miopenBFloat8: - case miopenDouble: break; - } -#endif - return false; -} - bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::operator==( const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const { @@ -472,7 +368,7 @@ ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig( const ExecutionContext&, const ProblemDescription& problem) const { PerformanceConfigHipImplicitGemm3DGroupFwdXdlops pp; - pp.HeuristicInit(problem); + // pp.HeuristicInit(problem); return pp; } @@ -481,7 +377,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig( const ProblemDescription& problem, const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const { - return config.IsValid(problem); + // return config.IsValid(problem); } size_t diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp new file mode 100644 index 0000000000..183b7045c1 --- /dev/null +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -0,0 +1,500 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include + +#include +#include +#include +#include +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL +#include +#include +#include +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/helper.hpp" +#endif +#include +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) + +namespace miopen { +namespace solver { +namespace conv { + +using ProblemDescription = miopen::conv::ProblemDescription; + +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; +using Scale = ck::tensor_operation::element_wise::Scale; +static constexpr ck::index_t NumDimSpatial = 3; + +template +using DeviceOpGFwdBilinear = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + DataType, + DataType, + ck::Tuple, + DataType, + PassThrough, + PassThrough, + Bilinear>; + +template +using DeviceOpGFwdScale = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + DataType, + DataType, + ck::Tuple<>, + DataType, + PassThrough, + PassThrough, + Scale>; + +template +using DeviceOpGFwdDefault = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + DataType, + DataType, + ck::Tuple<>, + DataType, + PassThrough, + PassThrough, + PassThrough>; + +template +using DeviceOpGFwdBilinearPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOpGFwdBilinear>; + +template +using DeviceOpGFwdScalePtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOpGFwdScale>; + +template +using DeviceOpGFwdDefaultPtrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOpGFwdDefault>; + +namespace { + +template +struct CKArgs +{ + CKArgs(const ProblemDescription& problem) + { + G = ProblemInterpreter::GetGroupCountG(problem); + N = ProblemInterpreter::GetBatchN(problem); + K1 = ProblemInterpreter::GetOutputChannelK(problem); + C1 = ProblemInterpreter::GetInputChannelC(problem); + C = C1 / G; // Number of input Channel per group + K = K1 / G; // Number of output Channel per group + Hi = ProblemInterpreter::GetInputHeightHi(problem); + Wi = ProblemInterpreter::GetInputWidthWi(problem); + Ho = ProblemInterpreter::GetOutputHeightHo(problem); + Wo = ProblemInterpreter::GetOutputWidthWo(problem); + Y = ProblemInterpreter::GetFilterHeightY(problem); + X = ProblemInterpreter::GetFilterWidthX(problem); + Di = ProblemInterpreter::GetInputDepthDi(problem); + Do = ProblemInterpreter::GetOutputDepthDo(problem); + Z = ProblemInterpreter::GetFilterDepthZ(problem); + alpha_beta_case = ProblemInterpreter::GetAlphaBetaCase(problem); + + in_lengths = {G, N, C, Di, Hi, Wi}; + out_lengths = {G, N, K, Do, Ho, Wo}; + wei_lengths = {G, K, C, Z, Y, X}; + + // CK strides are in GNCDHW order + if(problem.IsLayoutNHWC()) + { + // first entry reserved for G's stride + auto copy_strides = [](const auto& src, auto& dst) { + assert(dst.size() == (src.size() + 1)); + std::copy(src.begin(), src.end(), dst.begin() + 1); + }; + copy_strides(problem.GetIn().GetStrides(), in_strides); + copy_strides(problem.GetOut().GetStrides(), out_strides); + copy_strides(problem.GetWeights().GetStrides(), wei_strides); + + // Now compute G's stride + in_strides[0] = C; + out_strides[0] = K; + wei_strides[0] = K * wei_strides[1]; + } + else + { + assert(problem.IsLayoutDefault()); // already checked in IsApplicable + // for default layout, we produce packed strides for NHWC layout + // because we transpose to NHWC layout before calling CK kernel + in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C}; + out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; + wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + } + + filter_strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), + ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), + ProblemInterpreter::GetAdjustedConvolutionStrideW(problem)}; + filter_dilations = {ProblemInterpreter::GetAdjustedConvolutionDilationD(problem), + ProblemInterpreter::GetAdjustedConvolutionDilationH(problem), + ProblemInterpreter::GetAdjustedConvolutionDilationW(problem)}; + lPadding = {ProblemInterpreter::GetInputLeftPadD(problem), + ProblemInterpreter::GetInputLeftPadH(problem), + ProblemInterpreter::GetInputLeftPadW(problem)}; + rPadding = {ProblemInterpreter::GetAdjustedInputRightPadD(problem), + ProblemInterpreter::GetAdjustedInputRightPadH(problem), + ProblemInterpreter::GetAdjustedInputRightPadW(problem)}; + } + + CKArgs(const CKArgs&) = default; + CKArgs(CKArgs&&) noexcept = default; + CKArgs& operator=(const CKArgs&) = default; + + template + auto MakeArgPtr(const ConvPtr& conv_ptr, + ConstData_t in, + ConstData_t w, + Data_t out, + float alpha, + float beta) const + { + using DeviceP = std::remove_pointer_t; + if constexpr(std::is_same_v>) + { + return MakeBilinearArgPtr(conv_ptr, in, w, out, alpha, beta); + } + else if constexpr(std::is_same_v>) + { + (void)beta; + return MakeScaleArgPtr(conv_ptr, in, w, out, alpha); + } + else + { + (void)alpha; + (void)beta; + static_assert(std::is_same_v>, + "Default should be fwd pass through"); + return MakeDefaultArgPtr(conv_ptr, in, w, out); + } + } + + template + auto MakeBilinearArgPtr(const ConvPtr& conv_ptr, + ConstData_t in, + ConstData_t w, + Data_t out, + float alpha, + float beta) const + { + return conv_ptr->MakeArgumentPointer(in, + w, + {out}, + out, + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {out_lengths}, + {out_strides}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + lPadding, + rPadding, + PassThrough{}, + PassThrough{}, + Bilinear{alpha, beta}); + } + + template + auto MakeScaleArgPtr( + const ConvPtr& conv_ptr, ConstData_t in, ConstData_t w, Data_t out, float alpha) const + { + return conv_ptr->MakeArgumentPointer(in, + w, + {}, + out, + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + lPadding, + rPadding, + PassThrough{}, + PassThrough{}, + Scale{alpha}); + } + + template + auto MakeDefaultArgPtr(const ConvPtr& conv_ptr, ConstData_t in, ConstData_t w, Data_t out) const + { + return conv_ptr->MakeArgumentPointer(in, + w, + {}, + out, + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + lPadding, + rPadding, + PassThrough{}, + PassThrough{}, + PassThrough{}); + } + + template + auto MakeArgPtr(const ConvPtr& conv_ptr, + const ConvDataTensors& tensors, + float alpha, + float beta) const + { + return MakeArgPtr(conv_ptr, tensors.in, tensors.w, tensors.out, alpha, beta); + } + + template + bool IsSupportedBy(const ConvPtr& conv_ptr) const + { + auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f); + return conv_ptr->IsSupportedArgument(arg_ptr.get()); + } + + int G; + int N; + int K; + int C; + int C1; + int K1; + int Hi; + int Wi; + int Di; + int Ho; + int Wo; + int Do; + int Y; + int X; + int Z; + std::array in_lengths; + std::array in_strides; + std::array out_lengths; + std::array out_strides; + std::array wei_lengths; + std::array wei_strides; + std::array filter_strides; + std::array filter_dilations; + std::array lPadding; + std::array rPadding; + miopenAlphaBetaCase_t alpha_beta_case; +}; + +} // namespace + +template +bool ConvHipImplicitGemm3DGroupFwdXdlops::CheckCKApplicability( + const ProblemDescription& problem) const +{ + switch(problem.GetAlphaBetaCase()) + { + case BILINEAR: + return IsCKApplicable, CKArgs>(problem); + case SCALE: return IsCKApplicable, CKArgs>(problem); + default: return IsCKApplicable, CKArgs>(problem); + } +} +#endif + +bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::operator==( + const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const +{ + return kernel_id == other.kernel_id; +} + +PerformanceConfigHipImplicitGemm3DGroupFwdXdlops +ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig( + const ExecutionContext&, const ProblemDescription& problem) const +{ + PerformanceConfigHipImplicitGemm3DGroupFwdXdlops pp; + // pp.HeuristicInit(problem); + return pp; +} + +bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig( + const ExecutionContext&, + const ProblemDescription& problem, + const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const +{ + // return config.IsValid(problem); +} + +size_t +ConvHipImplicitGemm3DGroupFwdXdlops::GetWorkspaceSize(const ExecutionContext&, + const ProblemDescription& problem) const +{ + return GetWorkspaceSizeLayoutTransformConv(problem); +} + +PerformanceConfigHipImplicitGemm3DGroupFwdXdlops +ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ExecutionContext& ctx, + const ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, problem, invoke_ctx); +} + +bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( + [[maybe_unused]] const ExecutionContext& ctx, + [[maybe_unused]] const ProblemDescription& problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + if(env::disabled(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS)) + return false; + // check if type float else return false + if(problem.GetConv().attribute.deterministic) + return false; + if(!problem.AllTensorsDimsFitIntoInt()) + return false; + if(problem.HasMixedDataTypes()) + return false; + if(!problem.IsDirectionForward()) + return false; + if(!problem.Is3d()) + return false; + if(!(problem.IsLayoutNHWC() || problem.IsLayoutDefault())) + return false; + // needed because layout transpose kernel does not support non-packed tensors + if(problem.IsLayoutDefault() && problem.HasNonPackedTensors()) + return false; + if(!ck_utility::is_ck_whitelist(ctx.GetStream().GetDeviceName())) + return false; + switch(problem.GetInDataType()) + { + case miopenHalf: return CheckCKApplicability(problem); + case miopenFloat: return CheckCKApplicability(problem); + case miopenInt8: return CheckCKApplicability(problem); + case miopenBFloat16: return CheckCKApplicability(problem); + case miopenInt64: + case miopenInt32: + case miopenFloat8: + case miopenBFloat8: + case miopenDouble: break; + } +#endif + return false; +} + +ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( + [[maybe_unused]] const ExecutionContext& ctx, + [[maybe_unused]] const ProblemDescription& problem, + [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + return MakeSolutionGroupConvImplicitGemmXdlops( + problem, + [&](auto data_type_val) { + using T = decltype(data_type_val); + switch(problem.GetAlphaBetaCase()) + { + case BILINEAR: + return InitInvokerFactoryFwdNCHW<3, + DeviceOpGFwdBilinearPtrs, + CKArgs, + miopen::conv::DataInvokeParams>( + ctx, problem, config.kernel_id); + case SCALE: + return InitInvokerFactoryFwdNCHW<3, + DeviceOpGFwdScalePtrs, + CKArgs, + miopen::conv::DataInvokeParams>( + ctx, problem, config.kernel_id); + default: + return InitInvokerFactoryFwdNCHW<3, + DeviceOpGFwdDefaultPtrs, + CKArgs, + miopen::conv::DataInvokeParams>( + ctx, problem, config.kernel_id); + } + }, + [&](auto data_type_val) { + using T = decltype(data_type_val); + switch(problem.GetAlphaBetaCase()) + { + case BILINEAR: + return InitInvokerFactoryNHWC, + CKArgs, + miopen::conv::DataInvokeParams>( + ctx, problem, config.kernel_id); + case SCALE: + return InitInvokerFactoryNHWC, + CKArgs, + miopen::conv::DataInvokeParams>( + ctx, problem, config.kernel_id); + default: + return InitInvokerFactoryNHWC, + CKArgs, + miopen::conv::DataInvokeParams>( + ctx, problem, config.kernel_id); + } + }); + +#else + return {}; +#endif +} + +} // namespace conv +} // namespace solver +} // namespace miopen From 4f0914c0ab2ecb6e04828c1d94677023e36f3455 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 5 Aug 2024 23:26:23 +0000 Subject: [PATCH 02/69] adding my solver --- src/include/miopen/solver.hpp | 20 + src/mlo_dir_conv.cpp | 1 + src/solver.cpp | 5 + ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 112 ++++- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 425 +++++------------- 5 files changed, 235 insertions(+), 328 deletions(-) diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 891b3d0d5e..f01ee9510f 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -4791,6 +4791,26 @@ struct ConvHipImplicitGemm3DGroupFwdXdlops final bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; }; +struct ConvHipImplicitGemmGroupFwdXdlopsCodegen final : ConvSolver +{ + // TODO: update this fcn + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( + const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; + + bool MayNeedWorkspace() const override { return true; } + + MIOPEN_INTERNALS_EXPORT bool + IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; + + MIOPEN_INTERNALS_EXPORT ConvSolution + GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; +}; + struct PerformanceConfigHipImplicitGemm3DGroupWrwXdlops : PerfConfigBaseCK { diff --git a/src/mlo_dir_conv.cpp b/src/mlo_dir_conv.cpp index 1b80d0d729..79c56d4840 100644 --- a/src/mlo_dir_conv.cpp +++ b/src/mlo_dir_conv.cpp @@ -119,6 +119,7 @@ static auto GetImplicitGemmSolvers() miopen::solver::conv::ConvHipImplicitGemm3DGroupBwdXdlops, miopen::solver::conv::ConvHipImplicitGemmF16F8F16FwdXdlops, miopen::solver::conv::ConvHipImplicitGemmF16F8F16BwdXdlops, + miopen::solver::conv::ConvHipImplicitGemmGroupFwdXdlopsCodegen, #endif // MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL miopen::solver::conv::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC>{}; } diff --git a/src/solver.cpp b/src/solver.cpp index e468d38d0a..da347d8d7c 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -671,6 +671,11 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); + RegisterWithSolver(registry, + ++id, + conv::ConvHipImplicitGemmGroupFwdXdlopsCodegen{}, + miopenConvolutionAlgoImplicitGEMM); + // IMPORTANT: New solvers should be added to the end of the function! } diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 183b7045c1..4b3a2f8ed2 100644 --- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -36,8 +36,6 @@ #include #include #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" -#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" -#include "ck/tensor_operation/gpu/device/helper.hpp" #endif #include MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) @@ -343,6 +341,46 @@ struct CKArgs } // namespace +template +void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::Init(const ProblemDescription& problem) +{ + switch(problem.GetAlphaBetaCase()) + { + case BILINEAR: + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + break; + case SCALE: + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + break; + default: + valid_kernels = + FillValidKernelsIDs, CKArgs>(problem); + break; + } + index = 0; + kernel_id = valid_kernels[index]; +} + +template +bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::CheckIsSupportCKArgs( + const ProblemDescription& problem) const +{ + switch(problem.GetAlphaBetaCase()) + { + case BILINEAR: + return IsCKArgsSupported, CKArgs>(problem, + kernel_id); + case SCALE: + return IsCKArgsSupported, CKArgs>(problem, + kernel_id); + default: + return IsCKArgsSupported, CKArgs>(problem, + kernel_id); + } +} + template bool ConvHipImplicitGemm3DGroupFwdXdlops::CheckCKApplicability( const ProblemDescription& problem) const @@ -357,6 +395,72 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::CheckCKApplicability( } #endif +void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit( + [[maybe_unused]] const ProblemDescription& problem) +{ + index = 0; + kernel_id = ""; + +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(problem.GetInDataType()) + { + case miopenHalf: Init(problem); break; + case miopenFloat: Init(problem); break; + case miopenInt8: Init(problem); break; + case miopenBFloat16: Init(problem); break; + case miopenInt64: + case miopenInt32: + case miopenFloat8: + case miopenBFloat8: + case miopenDouble: break; + } +#endif +} + +bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::SetNextValue( + const ProblemDescription& problem) +{ + if(valid_kernels.empty()) + { + HeuristicInit(problem); + assert(!valid_kernels.empty()); + return true; + } + if((index + 1) < valid_kernels.size()) + { + ++index; + kernel_id = valid_kernels[index]; + return true; + } + else + return false; +} + +bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValidValue() const +{ + return index < valid_kernels.size(); +} + +bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid( + [[maybe_unused]] const ProblemDescription& problem) const +{ +#if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + switch(problem.GetInDataType()) + { + case miopenHalf: return CheckIsSupportCKArgs(problem); + case miopenFloat: return CheckIsSupportCKArgs(problem); + case miopenInt8: return CheckIsSupportCKArgs(problem); + case miopenBFloat16: return CheckIsSupportCKArgs(problem); + case miopenInt64: + case miopenInt32: + case miopenFloat8: + case miopenBFloat8: + case miopenDouble: break; + } +#endif + return false; +} + bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::operator==( const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const { @@ -368,7 +472,7 @@ ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig( const ExecutionContext&, const ProblemDescription& problem) const { PerformanceConfigHipImplicitGemm3DGroupFwdXdlops pp; - // pp.HeuristicInit(problem); + pp.HeuristicInit(problem); return pp; } @@ -377,7 +481,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig( const ProblemDescription& problem, const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const { - // return config.IsValid(problem); + return config.IsValid(problem); } size_t diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 183b7045c1..4aaa1ac896 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -36,8 +36,6 @@ #include #include #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" -#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" -#include "ck/tensor_operation/gpu/device/helper.hpp" #endif #include MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) @@ -56,263 +54,83 @@ using OutLayout = ck::tensor_layout::convolution::NDH using PassThrough = ck::tensor_operation::element_wise::PassThrough; using Bilinear = ck::tensor_operation::element_wise::Bilinear; using Scale = ck::tensor_operation::element_wise::Scale; -static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t NumDimSpatial = 2; -template -using DeviceOpGFwdBilinear = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, - OutLayout, - DataType, - DataType, - ck::Tuple, - DataType, - PassThrough, - PassThrough, - Bilinear>; +const std::string conv_compile_check = R"__ck__( +#include <${include}> -template -using DeviceOpGFwdScale = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, - OutLayout, - DataType, - DataType, - ck::Tuple<>, - DataType, - PassThrough, - PassThrough, - Scale>; +${template}; -template -using DeviceOpGFwdDefault = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, - OutLayout, - DataType, - DataType, - ck::Tuple<>, - DataType, - PassThrough, - PassThrough, - PassThrough>; +)__ck__"; -template -using DeviceOpGFwdBilinearPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGFwdBilinear>; +namespace { -template -using DeviceOpGFwdScalePtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGFwdScale>; +std::string epilogue = R"( +struct Epilogue +{ + __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; -template -using DeviceOpGFwdDefaultPtrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOpGFwdDefault>; + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; -namespace { + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +}; +)"; +std::string prologue = ""; +// TODO: swap out CKArgs with my own ProblemDescription struct?? I need it to call codegen functions +// -> then pass this to CKArgs?? +// jk, just fill out the codegen problem descc according to MIO problem desc ... should be quick template struct CKArgs { CKArgs(const ProblemDescription& problem) { - G = ProblemInterpreter::GetGroupCountG(problem); - N = ProblemInterpreter::GetBatchN(problem); - K1 = ProblemInterpreter::GetOutputChannelK(problem); - C1 = ProblemInterpreter::GetInputChannelC(problem); - C = C1 / G; // Number of input Channel per group - K = K1 / G; // Number of output Channel per group - Hi = ProblemInterpreter::GetInputHeightHi(problem); - Wi = ProblemInterpreter::GetInputWidthWi(problem); - Ho = ProblemInterpreter::GetOutputHeightHo(problem); - Wo = ProblemInterpreter::GetOutputWidthWo(problem); - Y = ProblemInterpreter::GetFilterHeightY(problem); - X = ProblemInterpreter::GetFilterWidthX(problem); - Di = ProblemInterpreter::GetInputDepthDi(problem); - Do = ProblemInterpreter::GetOutputDepthDo(problem); - Z = ProblemInterpreter::GetFilterDepthZ(problem); - alpha_beta_case = ProblemInterpreter::GetAlphaBetaCase(problem); - - in_lengths = {G, N, C, Di, Hi, Wi}; - out_lengths = {G, N, K, Do, Ho, Wo}; - wei_lengths = {G, K, C, Z, Y, X}; - - // CK strides are in GNCDHW order - if(problem.IsLayoutNHWC()) - { - // first entry reserved for G's stride - auto copy_strides = [](const auto& src, auto& dst) { - assert(dst.size() == (src.size() + 1)); - std::copy(src.begin(), src.end(), dst.begin() + 1); - }; - copy_strides(problem.GetIn().GetStrides(), in_strides); - copy_strides(problem.GetOut().GetStrides(), out_strides); - copy_strides(problem.GetWeights().GetStrides(), wei_strides); - - // Now compute G's stride - in_strides[0] = C; - out_strides[0] = K; - wei_strides[0] = K * wei_strides[1]; - } - else - { - assert(problem.IsLayoutDefault()); // already checked in IsApplicable - // for default layout, we produce packed strides for NHWC layout - // because we transpose to NHWC layout before calling CK kernel - in_strides = {C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C}; - out_strides = {K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; - wei_strides = {K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; - } - - filter_strides = {ProblemInterpreter::GetAdjustedConvolutionStrideD(problem), - ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), - ProblemInterpreter::GetAdjustedConvolutionStrideW(problem)}; - filter_dilations = {ProblemInterpreter::GetAdjustedConvolutionDilationD(problem), - ProblemInterpreter::GetAdjustedConvolutionDilationH(problem), - ProblemInterpreter::GetAdjustedConvolutionDilationW(problem)}; - lPadding = {ProblemInterpreter::GetInputLeftPadD(problem), - ProblemInterpreter::GetInputLeftPadH(problem), + + ck::host::conv::Problem_Conv_Fwd prob; + prob.NumDim = NumDimSpatial; + prob.G = ProblemInterpreter::GetGroupCountG(problem); + prob.N = ProblemInterpreter::GetBatchN(problem); + K1 = ProblemInterpreter::GetOutputChannelK(problem); + C1 = ProblemInterpreter::GetInputChannelC(problem); + prob.C = C1 / G; // Number of input Channel per group + prob.K = K1 / G; // Number of output Channel per group + prob.Y = ProblemInterpreter::GetFilterHeightY(problem); + prob.X = ProblemInterpreter::GetFilterWidthX(problem); + prob.Hi = ProblemInterpreter::GetInputHeightHi(problem); + prob.Wi = ProblemInterpreter::GetInputWidthWi(problem); + prob.Ho = ProblemInterpreter::GetOutputHeightHo(problem); + prob.Wo = ProblemInterpreter::GetOutputWidthWo(problem); + + input = {G, N, C, Hi, Wi}; + output = {G, N, K, Ho, Wo}; + weight = {G, K, C, Y, X}; + + in_strides = {C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; + out_strides = {K, Ho * Wo * G * K, 1, Wo * G * K, G * K}; + wei_strides = {K * Y * X * C, Y * X * C, 1, X * C, C}; + strides = {ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), + ProblemInterpreter::GetAdjustedConvolutionStrideW(problem)}; + dilation = {ProblemInterpreter::GetAdjustedConvolutionDilationH(problem), + ProblemInterpreter::GetAdjustedConvolutionDilationW(problem)}; + lPadding = {ProblemInterpreter::GetInputLeftPadH(problem), ProblemInterpreter::GetInputLeftPadW(problem)}; - rPadding = {ProblemInterpreter::GetAdjustedInputRightPadD(problem), - ProblemInterpreter::GetAdjustedInputRightPadH(problem), + rPadding = {ProblemInterpreter::GetAdjustedInputRightPadH(problem), ProblemInterpreter::GetAdjustedInputRightPadW(problem)}; } - CKArgs(const CKArgs&) = default; + /**CKArgs(const CKArgs&) = default; CKArgs(CKArgs&&) noexcept = default; CKArgs& operator=(const CKArgs&) = default; - template - auto MakeArgPtr(const ConvPtr& conv_ptr, - ConstData_t in, - ConstData_t w, - Data_t out, - float alpha, - float beta) const - { - using DeviceP = std::remove_pointer_t; - if constexpr(std::is_same_v>) - { - return MakeBilinearArgPtr(conv_ptr, in, w, out, alpha, beta); - } - else if constexpr(std::is_same_v>) - { - (void)beta; - return MakeScaleArgPtr(conv_ptr, in, w, out, alpha); - } - else - { - (void)alpha; - (void)beta; - static_assert(std::is_same_v>, - "Default should be fwd pass through"); - return MakeDefaultArgPtr(conv_ptr, in, w, out); - } - } - - template - auto MakeBilinearArgPtr(const ConvPtr& conv_ptr, - ConstData_t in, - ConstData_t w, - Data_t out, - float alpha, - float beta) const - { - return conv_ptr->MakeArgumentPointer(in, - w, - {out}, - out, - in_lengths, - in_strides, - wei_lengths, - wei_strides, - {out_lengths}, - {out_strides}, - out_lengths, - out_strides, - filter_strides, - filter_dilations, - lPadding, - rPadding, - PassThrough{}, - PassThrough{}, - Bilinear{alpha, beta}); - } - - template - auto MakeScaleArgPtr( - const ConvPtr& conv_ptr, ConstData_t in, ConstData_t w, Data_t out, float alpha) const - { - return conv_ptr->MakeArgumentPointer(in, - w, - {}, - out, - in_lengths, - in_strides, - wei_lengths, - wei_strides, - {}, - {}, - out_lengths, - out_strides, - filter_strides, - filter_dilations, - lPadding, - rPadding, - PassThrough{}, - PassThrough{}, - Scale{alpha}); - } - - template - auto MakeDefaultArgPtr(const ConvPtr& conv_ptr, ConstData_t in, ConstData_t w, Data_t out) const - { - return conv_ptr->MakeArgumentPointer(in, - w, - {}, - out, - in_lengths, - in_strides, - wei_lengths, - wei_strides, - {}, - {}, - out_lengths, - out_strides, - filter_strides, - filter_dilations, - lPadding, - rPadding, - PassThrough{}, - PassThrough{}, - PassThrough{}); - } - - template - auto MakeArgPtr(const ConvPtr& conv_ptr, - const ConvDataTensors& tensors, - float alpha, - float beta) const - { - return MakeArgPtr(conv_ptr, tensors.in, tensors.w, tensors.out, alpha, beta); - } - - template - bool IsSupportedBy(const ConvPtr& conv_ptr) const - { - auto arg_ptr = MakeArgPtr(conv_ptr, nullptr, nullptr, nullptr, 1.0f, 0.0f); - return conv_ptr->IsSupportedArgument(arg_ptr.get()); - } - int G; int N; int K; @@ -338,48 +156,13 @@ struct CKArgs std::array filter_dilations; std::array lPadding; std::array rPadding; - miopenAlphaBetaCase_t alpha_beta_case; + miopenAlphaBetaCase_t alpha_beta_case;**/ }; } // namespace -template -bool ConvHipImplicitGemm3DGroupFwdXdlops::CheckCKApplicability( - const ProblemDescription& problem) const -{ - switch(problem.GetAlphaBetaCase()) - { - case BILINEAR: - return IsCKApplicable, CKArgs>(problem); - case SCALE: return IsCKApplicable, CKArgs>(problem); - default: return IsCKApplicable, CKArgs>(problem); - } -} #endif -bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::operator==( - const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const -{ - return kernel_id == other.kernel_id; -} - -PerformanceConfigHipImplicitGemm3DGroupFwdXdlops -ConvHipImplicitGemm3DGroupFwdXdlops::GetDefaultPerformanceConfig( - const ExecutionContext&, const ProblemDescription& problem) const -{ - PerformanceConfigHipImplicitGemm3DGroupFwdXdlops pp; - // pp.HeuristicInit(problem); - return pp; -} - -bool ConvHipImplicitGemm3DGroupFwdXdlops::IsValidPerformanceConfig( - const ExecutionContext&, - const ProblemDescription& problem, - const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const -{ - // return config.IsValid(problem); -} - size_t ConvHipImplicitGemm3DGroupFwdXdlops::GetWorkspaceSize(const ExecutionContext&, const ProblemDescription& problem) const @@ -387,13 +170,13 @@ ConvHipImplicitGemm3DGroupFwdXdlops::GetWorkspaceSize(const ExecutionContext&, return GetWorkspaceSizeLayoutTransformConv(problem); } -PerformanceConfigHipImplicitGemm3DGroupFwdXdlops +/**PerformanceConfigHipImplicitGemm3DGroupFwdXdlops ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ExecutionContext& ctx, const ProblemDescription& problem, const AnyInvokeParams& invoke_ctx) const { return GenericSearch(*this, ctx, problem, invoke_ctx); -} +}**/ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( [[maybe_unused]] const ExecutionContext& ctx, @@ -442,53 +225,47 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const { #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - return MakeSolutionGroupConvImplicitGemmXdlops( - problem, - [&](auto data_type_val) { - using T = decltype(data_type_val); - switch(problem.GetAlphaBetaCase()) - { - case BILINEAR: - return InitInvokerFactoryFwdNCHW<3, - DeviceOpGFwdBilinearPtrs, - CKArgs, - miopen::conv::DataInvokeParams>( - ctx, problem, config.kernel_id); - case SCALE: - return InitInvokerFactoryFwdNCHW<3, - DeviceOpGFwdScalePtrs, - CKArgs, - miopen::conv::DataInvokeParams>( - ctx, problem, config.kernel_id); - default: - return InitInvokerFactoryFwdNCHW<3, - DeviceOpGFwdDefaultPtrs, - CKArgs, - miopen::conv::DataInvokeParams>( - ctx, problem, config.kernel_id); - } - }, - [&](auto data_type_val) { - using T = decltype(data_type_val); - switch(problem.GetAlphaBetaCase()) - { - case BILINEAR: - return InitInvokerFactoryNHWC, - CKArgs, - miopen::conv::DataInvokeParams>( - ctx, problem, config.kernel_id); - case SCALE: - return InitInvokerFactoryNHWC, - CKArgs, - miopen::conv::DataInvokeParams>( - ctx, problem, config.kernel_id); - default: - return InitInvokerFactoryNHWC, - CKArgs, - miopen::conv::DataInvokeParams>( - ctx, problem, config.kernel_id); - } - }); + auto in_dev = + to_gpu(generate_buffer>(in_lengths, in_strides, 0)); + auto wei_dev = + to_gpu(generate_buffer>(wei_lengths, wei_strides, 1)); + auto out_dev = + to_gpu(generate_buffer>(out_lengths, out_strides, 2)); + + auto solution : prob.GetSolutions("gfx908", prologue, epilogue); + // substitute instance values into the template + auto src = ck::host::InterpolateString( + conv_compile_check, + {{"include", prob.GetIncludeHeader()}, {"template", solution[0].ToTemplateString()}}); + + auto srcs = get_headers_for_test(); + srcs.push_back({"main.cpp", src}); + rtc::compile_options options; + auto name = solution[0].GetTemplateParameter("name"); + options.kernel_name = "run_" + name; + auto k = rtc::compile_kernel(srcs, options); + + // Grid size calculation + auto block_size = solution[0].GetTemplateParameter("BlockSize"); + + auto tmp = get_launch_params(solution[0], out_lengths, out_strides); + + auto grid_size = tmp * in_lengths[1]; + + // launch the kernel with arguments needed for the argument pointer + k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), + wei_dev.data(), + out_dev.data(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); #else return {}; From 21cb29325610b53bc63f536f2f9ddf25c7f90ccd Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 7 Aug 2024 07:51:44 +0000 Subject: [PATCH 03/69] cleared up compiler issues, waiting on codegen build with CK --- src/CMakeLists.txt | 1 + ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 147 ++++++++++-------- 2 files changed, 79 insertions(+), 69 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c5fcbf9220..13b5a890b8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -298,6 +298,7 @@ set( MIOpen_Source solver/reduce/forward_sum.cpp solver/softmax/attn_softmax.cpp solver/softmax/softmax.cpp + solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp subbuffers.cpp sum_api.cpp t5layernorm_api.cpp diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 4aaa1ac896..79c28d72d0 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -86,10 +86,7 @@ struct Epilogue )"; std::string prologue = ""; -// TODO: swap out CKArgs with my own ProblemDescription struct?? I need it to call codegen functions -// -> then pass this to CKArgs?? -// jk, just fill out the codegen problem descc according to MIO problem desc ... should be quick -template +// template struct CKArgs { CKArgs(const ProblemDescription& problem) @@ -99,10 +96,10 @@ struct CKArgs prob.NumDim = NumDimSpatial; prob.G = ProblemInterpreter::GetGroupCountG(problem); prob.N = ProblemInterpreter::GetBatchN(problem); - K1 = ProblemInterpreter::GetOutputChannelK(problem); - C1 = ProblemInterpreter::GetInputChannelC(problem); - prob.C = C1 / G; // Number of input Channel per group - prob.K = K1 / G; // Number of output Channel per group + int K1 = ProblemInterpreter::GetOutputChannelK(problem); + int C1 = ProblemInterpreter::GetInputChannelC(problem); + prob.C = C1 / prob.G; // Number of input Channel per group + prob.K = K1 / prob.G; // Number of output Channel per group prob.Y = ProblemInterpreter::GetFilterHeightY(problem); prob.X = ProblemInterpreter::GetFilterWidthX(problem); prob.Hi = ProblemInterpreter::GetInputHeightHi(problem); @@ -110,20 +107,32 @@ struct CKArgs prob.Ho = ProblemInterpreter::GetOutputHeightHo(problem); prob.Wo = ProblemInterpreter::GetOutputWidthWo(problem); - input = {G, N, C, Hi, Wi}; - output = {G, N, K, Ho, Wo}; - weight = {G, K, C, Y, X}; - - in_strides = {C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; - out_strides = {K, Ho * Wo * G * K, 1, Wo * G * K, G * K}; - wei_strides = {K * Y * X * C, Y * X * C, 1, X * C, C}; - strides = {ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), - ProblemInterpreter::GetAdjustedConvolutionStrideW(problem)}; - dilation = {ProblemInterpreter::GetAdjustedConvolutionDilationH(problem), - ProblemInterpreter::GetAdjustedConvolutionDilationW(problem)}; - lPadding = {ProblemInterpreter::GetInputLeftPadH(problem), + in_lengths = {prob.G, prob.N, prob.C, prob.Hi, prob.Wi}; + out_lengths = {prob.G, prob.N, prob.K, prob.Ho, prob.Wo}; + wei_lengths = {prob.G, prob.K, prob.C, prob.Y, prob.X}; + + in_strides = {prob.C, + prob.Hi * prob.Wi * prob.G * prob.C, + 1, + prob.Wi * prob.G * prob.C, + prob.G * prob.C}; + out_strides = {prob.K, + prob.Ho * prob.Wo * prob.G * prob.K, + 1, + prob.Wo * prob.G * prob.K, + prob.G * prob.K}; + wei_strides = {prob.K * prob.Y * prob.X * prob.C, + prob.Y * prob.X * prob.C, + 1, + prob.X * prob.C, + prob.C}; + filter_strides = {ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), + ProblemInterpreter::GetAdjustedConvolutionStrideW(problem)}; + filter_dilations = {ProblemInterpreter::GetAdjustedConvolutionDilationH(problem), + ProblemInterpreter::GetAdjustedConvolutionDilationW(problem)}; + lPadding = {ProblemInterpreter::GetInputLeftPadH(problem), ProblemInterpreter::GetInputLeftPadW(problem)}; - rPadding = {ProblemInterpreter::GetAdjustedInputRightPadH(problem), + rPadding = {ProblemInterpreter::GetAdjustedInputRightPadH(problem), ProblemInterpreter::GetAdjustedInputRightPadW(problem)}; } @@ -145,18 +154,18 @@ struct CKArgs int Do; int Y; int X; - int Z; - std::array in_lengths; - std::array in_strides; - std::array out_lengths; - std::array out_strides; - std::array wei_lengths; - std::array wei_strides; - std::array filter_strides; - std::array filter_dilations; - std::array lPadding; - std::array rPadding; - miopenAlphaBetaCase_t alpha_beta_case;**/ + int Z;**/ + std::array in_lengths; + std::array in_strides; + std::array out_lengths; + std::array out_strides; + std::array wei_lengths; + std::array wei_strides; + std::array filter_strides; + std::array filter_dilations; + std::array lPadding; + std::array rPadding; + // miopenAlphaBetaCase_t alpha_beta_case; }; } // namespace @@ -164,8 +173,8 @@ struct CKArgs #endif size_t -ConvHipImplicitGemm3DGroupFwdXdlops::GetWorkspaceSize(const ExecutionContext&, - const ProblemDescription& problem) const +ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetWorkspaceSize(const ExecutionContext&, + const ProblemDescription& problem) const { return GetWorkspaceSizeLayoutTransformConv(problem); } @@ -178,7 +187,7 @@ ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ExecutionContext& ctx, return GenericSearch(*this, ctx, problem, invoke_ctx); }**/ -bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( +bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) const { @@ -203,34 +212,34 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable( return false; if(!ck_utility::is_ck_whitelist(ctx.GetStream().GetDeviceName())) return false; - switch(problem.GetInDataType()) - { - case miopenHalf: return CheckCKApplicability(problem); - case miopenFloat: return CheckCKApplicability(problem); - case miopenInt8: return CheckCKApplicability(problem); - case miopenBFloat16: return CheckCKApplicability(problem); - case miopenInt64: - case miopenInt32: - case miopenFloat8: - case miopenBFloat8: - case miopenDouble: break; - } + /**switch(problem.GetInDataType()) + { + case miopenHalf: return CheckCKApplicability(problem); + case miopenFloat: return CheckCKApplicability(problem); + case miopenInt8: return CheckCKApplicability(problem); + case miopenBFloat16: return CheckCKApplicability(problem); + case miopenInt64: + case miopenInt32: + case miopenFloat8: + case miopenBFloat8: + case miopenDouble: break; + }**/ #endif return false; } -ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( +ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( [[maybe_unused]] const ExecutionContext& ctx, - [[maybe_unused]] const ProblemDescription& problem, - [[maybe_unused]] const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& config) const + [[maybe_unused]] const ProblemDescription& problem) const { + auto prob = CKArgs(problem); #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - auto in_dev = - to_gpu(generate_buffer>(in_lengths, in_strides, 0)); - auto wei_dev = - to_gpu(generate_buffer>(wei_lengths, wei_strides, 1)); - auto out_dev = - to_gpu(generate_buffer>(out_lengths, out_strides, 2)); + auto in_dev = to_gpu(generate_buffer>( + prob.in_lengths, prob.in_strides, 0)); + auto wei_dev = to_gpu(generate_buffer>( + prob.wei_lengths, prob.wei_strides, 1)); + auto out_dev = to_gpu(generate_buffer>( + prob.out_lengths, prob.out_strides, 2)); auto solution : prob.GetSolutions("gfx908", prologue, epilogue); // substitute instance values into the template @@ -248,24 +257,24 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution( // Grid size calculation auto block_size = solution[0].GetTemplateParameter("BlockSize"); - auto tmp = get_launch_params(solution[0], out_lengths, out_strides); + auto tmp = get_launch_params(solution[0], prob.out_lengths, prob.out_strides); - auto grid_size = tmp * in_lengths[1]; + auto grid_size = tmp * prob.in_lengths[1]; // launch the kernel with arguments needed for the argument pointer k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), wei_dev.data(), out_dev.data(), - in_lengths, - in_strides, - wei_lengths, - wei_strides, - out_lengths, - out_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads); + prob.in_lengths, + prob.in_strides, + prob.wei_lengths, + prob.wei_strides, + prob.out_lengths, + prob.out_strides, + prob.filter_strides, + prob.filter_dilations, + prob.lPadding, + prob.rPadding); #else return {}; From fa5aab1828c87e93d78b5537467fab3562fb3b60 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 12 Aug 2024 18:53:35 +0000 Subject: [PATCH 04/69] cleaned up code --- src/CMakeLists.txt | 1 + ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 25 ++----------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 13b5a890b8..089329aced 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,6 +62,7 @@ if(NOT MIOPEN_GENERATOR_IS_MULTI_CONFIG) message( STATUS "CMAKE_BUILD_TYPE= ${CMAKE_BUILD_TYPE}" ) endif() +message("CK include: ${COMPOSABLE_KERNEL_INCLUDE}") # This is incremented when the ABI to the library changes set( MIOpen_SOVERSION 1.0 ) diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 79c28d72d0..a005e03bd1 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -86,7 +86,6 @@ struct Epilogue )"; std::string prologue = ""; -// template struct CKArgs { CKArgs(const ProblemDescription& problem) @@ -136,11 +135,11 @@ struct CKArgs ProblemInterpreter::GetAdjustedInputRightPadW(problem)}; } - /**CKArgs(const CKArgs&) = default; + CKArgs(const CKArgs&) = default; CKArgs(CKArgs&&) noexcept = default; CKArgs& operator=(const CKArgs&) = default; - int G; + /**int G; int N; int K; int C; @@ -179,14 +178,6 @@ ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetWorkspaceSize(const ExecutionContex return GetWorkspaceSizeLayoutTransformConv(problem); } -/**PerformanceConfigHipImplicitGemm3DGroupFwdXdlops -ConvHipImplicitGemm3DGroupFwdXdlops::Search(const ExecutionContext& ctx, - const ProblemDescription& problem, - const AnyInvokeParams& invoke_ctx) const -{ - return GenericSearch(*this, ctx, problem, invoke_ctx); -}**/ - bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) const @@ -212,18 +203,6 @@ bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( return false; if(!ck_utility::is_ck_whitelist(ctx.GetStream().GetDeviceName())) return false; - /**switch(problem.GetInDataType()) - { - case miopenHalf: return CheckCKApplicability(problem); - case miopenFloat: return CheckCKApplicability(problem); - case miopenInt8: return CheckCKApplicability(problem); - case miopenBFloat16: return CheckCKApplicability(problem); - case miopenInt64: - case miopenInt32: - case miopenFloat8: - case miopenBFloat8: - case miopenDouble: break; - }**/ #endif return false; } From 183e041ce12bed2f538795718c48f07798c56cc7 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 14 Aug 2024 22:18:40 +0000 Subject: [PATCH 05/69] update requirements.txt --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5e19e14afc..8c78fe300d 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@3e9711f0cb1c7ffd3826a93dfa6dd65e98715636 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@da214a5a58fc232cbed2bbc2bef6156f49057c40 -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From 4eede0fc06a75c2822b372bd8b81858744031ef7 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 15 Aug 2024 19:07:12 +0000 Subject: [PATCH 06/69] testing new CK build --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8c78fe300d..5bbc2b9a80 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@da214a5a58fc232cbed2bbc2bef6156f49057c40 -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@272ebcb5ec8e4441de88abddded15144bc04597f -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 8e8962a202205b0f6ceb50cff9f1f925c709ea61 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 19 Aug 2024 21:18:46 +0000 Subject: [PATCH 07/69] adding ck_rtc build --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5bbc2b9a80..3fae5d9880 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@272ebcb5ec8e4441de88abddded15144bc04597f -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@d95d761dd9ac18f890366dd9d78b5fbd8030d147 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 7137fa252e1b09b8c362305d5794e217dd061ef2 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 22 Aug 2024 15:49:18 +0000 Subject: [PATCH 08/69] instances only build --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3fae5d9880..399ab0b423 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@d95d761dd9ac18f890366dd9d78b5fbd8030d147 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@967b1f0fda8bbbae5f1a9de71d3100380d04a068 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 3e10af6fe038a764f0e055b3db2ed2bd521a1038 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 26 Aug 2024 20:46:51 +0000 Subject: [PATCH 09/69] updated rtc build --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 399ab0b423..78fea353d4 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@967b1f0fda8bbbae5f1a9de71d3100380d04a068 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@25935b57a0eac3284b450c2a34a1914a49eca077 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 4e2d4807bb05c089eae0046e73a05451af62e2c3 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 26 Aug 2024 22:54:16 +0000 Subject: [PATCH 10/69] updated source file compilation, solver compiling, need to fix invoker --- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 101 +++++++++++++----- 1 file changed, 77 insertions(+), 24 deletions(-) diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index a005e03bd1..d9482f645f 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -31,11 +31,26 @@ #include #include #include +#include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL #include #include #include #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" + +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp" +#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp" +#include "ck/host/headers.hpp" +#include "ck/host/stringutils.hpp" +#include "ck/host/utils.hpp" +#include "ck/tensor_operation/gpu/device/helper.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +//#include +//#include +//#include +//#include "common.hpp" +#include #endif #include MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) @@ -86,12 +101,28 @@ struct Epilogue )"; std::string prologue = ""; +// TODO: temporarily have these two here due to build issues with ck_rtc, remove once resolved +struct src_file +{ + std::filesystem::path path; + std::string_view content; +}; +std::vector get_headers_for_test() +{ + std::vector result; + auto hs = ck::host::GetHeaders(); + std::transform( + hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> src_file { + return {p.first, p.second}; + }); + return result; +} + struct CKArgs { CKArgs(const ProblemDescription& problem) { - ck::host::conv::Problem_Conv_Fwd prob; prob.NumDim = NumDimSpatial; prob.G = ProblemInterpreter::GetGroupCountG(problem); prob.N = ProblemInterpreter::GetBatchN(problem); @@ -154,16 +185,17 @@ struct CKArgs int Y; int X; int Z;**/ - std::array in_lengths; - std::array in_strides; - std::array out_lengths; - std::array out_strides; - std::array wei_lengths; - std::array wei_strides; - std::array filter_strides; - std::array filter_dilations; - std::array lPadding; - std::array rPadding; + ck::host::conv::Problem_Conv_Fwd prob; + ck::Array in_lengths; + ck::Array in_strides; + ck::Array out_lengths; + ck::Array out_strides; + ck::Array wei_lengths; + ck::Array wei_strides; + ck::Array filter_strides; + ck::Array filter_dilations; + ck::Array lPadding; + ck::Array rPadding; // miopenAlphaBetaCase_t alpha_beta_case; }; @@ -211,37 +243,58 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) const { - auto prob = CKArgs(problem); + auto x = CKArgs(problem); #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - auto in_dev = to_gpu(generate_buffer>( + // TODO: update this: the user will provide the data, I just need to access the buffer, not + // create it + /**auto in_dev = to_gpu(generate_buffer>( prob.in_lengths, prob.in_strides, 0)); auto wei_dev = to_gpu(generate_buffer>( prob.wei_lengths, prob.wei_strides, 1)); auto out_dev = to_gpu(generate_buffer>( - prob.out_lengths, prob.out_strides, 2)); + prob.out_lengths, prob.out_strides, 2));**/ + + const auto workspace_req = GetWorkspaceSize(ctx, problem); + + auto soln = ConvSolution{miopenStatusSuccess}; + soln.workspace_sz = workspace_req; - auto solution : prob.GetSolutions("gfx908", prologue, epilogue); + auto solution = x.prob.GetSolutions("gfx908", prologue, epilogue); // substitute instance values into the template auto src = ck::host::InterpolateString( conv_compile_check, - {{"include", prob.GetIncludeHeader()}, {"template", solution[0].ToTemplateString()}}); + {{"include", x.prob.GetIncludeHeader()}, {"template", solution[0].ToTemplateString()}}); auto srcs = get_headers_for_test(); srcs.push_back({"main.cpp", src}); - rtc::compile_options options; - auto name = solution[0].GetTemplateParameter("name"); - options.kernel_name = "run_" + name; - auto k = rtc::compile_kernel(srcs, options); + auto kernel = KernelInfo{}; + auto name = solution[0].GetTemplateParameter("name"); + // FIXME: is this how to pass the src files? + kernel.kernel_file = srcs[srcs.size() - 1].path.filename().string(); + kernel.kernel_name = "run_" + name; + // rtc::compile_options options; + // auto name = solution[0].GetTemplateParameter("name"); + // options.kernel_name = "run_" + name; + // TODO: MIOpen has it's own handlers for compilation + // auto k = rtc::compile_kernel(srcs, options); // Grid size calculation auto block_size = solution[0].GetTemplateParameter("BlockSize"); - auto tmp = get_launch_params(solution[0], prob.out_lengths, prob.out_strides); + auto tmp = get_launch_params(solution[0], x.out_lengths, x.out_strides); + + auto grid_size = tmp * x.in_lengths[1]; + + /**bool bfp16parm = true; + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{});**/ - auto grid_size = tmp * prob.in_lengths[1]; + soln.construction_params.push_back(kernel); + // TODO: remove this, replace with lambda. MIOpen has it's own invoker to launch the kernel // launch the kernel with arguments needed for the argument pointer - k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), + /**k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), wei_dev.data(), out_dev.data(), prob.in_lengths, @@ -253,7 +306,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( prob.filter_strides, prob.filter_dilations, prob.lPadding, - prob.rPadding); + prob.rPadding);**/ #else return {}; From 463e52374e6752c8f4375da88fcd27b10c4e5b41 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 29 Aug 2024 18:52:59 +0000 Subject: [PATCH 11/69] added invoker, solver compiling but gets skipped --- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 43 +++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index d9482f645f..767a1c7b10 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -245,14 +245,10 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( { auto x = CKArgs(problem); #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - // TODO: update this: the user will provide the data, I just need to access the buffer, not - // create it - /**auto in_dev = to_gpu(generate_buffer>( - prob.in_lengths, prob.in_strides, 0)); - auto wei_dev = to_gpu(generate_buffer>( - prob.wei_lengths, prob.wei_strides, 1)); - auto out_dev = to_gpu(generate_buffer>( - prob.out_lengths, prob.out_strides, 2));**/ + /**decltype(auto) conv = problem.GetConv(); + decltype(auto) in = problem.GetIn(); + decltype(auto) wei = problem.GetWeights(); + decltype(auto) out = problem.GetOut();**/ const auto workspace_req = GetWorkspaceSize(ctx, problem); @@ -264,12 +260,11 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( auto src = ck::host::InterpolateString( conv_compile_check, {{"include", x.prob.GetIncludeHeader()}, {"template", solution[0].ToTemplateString()}}); - auto srcs = get_headers_for_test(); srcs.push_back({"main.cpp", src}); - auto kernel = KernelInfo{}; - auto name = solution[0].GetTemplateParameter("name"); - // FIXME: is this how to pass the src files? + auto name = solution[0].GetTemplateParameter("name"); + + auto kernel = KernelInfo{}; kernel.kernel_file = srcs[srcs.size() - 1].path.filename().string(); kernel.kernel_name = "run_" + name; // rtc::compile_options options; @@ -285,6 +280,9 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( auto grid_size = tmp * x.in_lengths[1]; + kernel.l_wk = {block_size, 1, 1}; + kernel.g_wk = {block_size * grid_size, 1, 1}; + /**bool bfp16parm = true; const auto build_params = KernelBuildParameters{ {"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; @@ -292,6 +290,26 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( soln.construction_params.push_back(kernel); + soln.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + kernel(params.tensors.in, + params.tensors.w, + params.tensors.out, + x.in_lengths, + x.in_strides, + x.wei_lengths, + x.wei_strides, + x.out_lengths, + x.out_strides, + x.filter_strides, + x.filter_dilations, + x.lPadding, + x.rPadding); + }; + }; // TODO: remove this, replace with lambda. MIOpen has it's own invoker to launch the kernel // launch the kernel with arguments needed for the argument pointer /**k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), @@ -308,6 +326,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( prob.lPadding, prob.rPadding);**/ + return soln; #else return {}; #endif From 3d0418a2a0333dbac9eb2befec4933220cae50b0 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Tue, 3 Sep 2024 16:41:24 +0000 Subject: [PATCH 12/69] formatting --- src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 767a1c7b10..fc8401f877 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -251,6 +251,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( decltype(auto) out = problem.GetOut();**/ const auto workspace_req = GetWorkspaceSize(ctx, problem); + std::cout << "workspace: " << workspace_req << std::endl; auto soln = ConvSolution{miopenStatusSuccess}; soln.workspace_sz = workspace_req; From d60b71970173a038ebf762581a8eeb91af16b7da Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 5 Sep 2024 08:05:54 +0000 Subject: [PATCH 13/69] updated codegen build --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 78fea353d4..bf0c967e00 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@25935b57a0eac3284b450c2a34a1914a49eca077 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@652a7c046381526947f507a89299aa92d89dbd02 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 1c2fe7287afdcc98a44ccd300797ae59fefc2719 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 6 Sep 2024 00:20:19 +0000 Subject: [PATCH 14/69] updating CMakes to include ck_host component from CK --- CMakeLists.txt | 2 +- src/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d42e141df..c77d2ed530 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -303,7 +303,7 @@ add_compile_definitions($<$:HIP_COMPILER_FLAGS=${HIP_COMPI # HIP if( MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU") if(MIOPEN_USE_COMPOSABLEKERNEL) - find_package(composable_kernel 1.0.0 COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations) + find_package(composable_kernel 1.0.0 COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations ck_host) endif() if( MIOPEN_BACKEND STREQUAL "HIPNOGPU") set(MIOPEN_MODE_NOGPU 1) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 089329aced..b6e00015c9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -785,7 +785,7 @@ target_include_directories(MIOpen PUBLIC ) if(MIOPEN_USE_COMPOSABLEKERNEL) -set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations hip::host) +set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations composable_kernel::ck_host hip::host) endif() if(WIN32) From 5549cfd4f39b50d13f79f00edcde21fa2ecf9f36 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 6 Sep 2024 00:22:49 +0000 Subject: [PATCH 15/69] removing some includes --- .../tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index fc8401f877..0a11ec0b61 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -46,9 +46,6 @@ #include "ck/tensor_operation/gpu/device/helper.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" -//#include -//#include -//#include //#include "common.hpp" #include #endif @@ -214,6 +211,8 @@ bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) const { + // FIXME: rewrite this function + return true; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL if(env::disabled(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS)) return false; @@ -233,8 +232,6 @@ bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( // needed because layout transpose kernel does not support non-packed tensors if(problem.IsLayoutDefault() && problem.HasNonPackedTensors()) return false; - if(!ck_utility::is_ck_whitelist(ctx.GetStream().GetDeviceName())) - return false; #endif return false; } From e5244e19fa7fd0788cdfbb8752e13cf64ade05bb Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 13 Sep 2024 21:33:16 +0000 Subject: [PATCH 16/69] temporarily adding generated kernel to src/kernel directory for testing purposes --- src/CMakeLists.txt | 4 +- src/kernels/main.cpp | 152 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 src/kernels/main.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b6e00015c9..f1d9f48958 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,7 +62,6 @@ if(NOT MIOPEN_GENERATOR_IS_MULTI_CONFIG) message( STATUS "CMAKE_BUILD_TYPE= ${CMAKE_BUILD_TYPE}" ) endif() -message("CK include: ${COMPOSABLE_KERNEL_INCLUDE}") # This is incremented when the ABI to the library changes set( MIOpen_SOVERSION 1.0 ) @@ -587,7 +586,8 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/xform_bidirect_winograd_data.s kernels/xform_bidirect_winograd_filter.s kernels/xform_bidirect_winograd_out.s - kernels/UniversalTranspose.cl) + kernels/UniversalTranspose.cl + kernels/main.cpp) # Kernels in development lists. # Should be ALWAYS empty in develop branch (at the time of PR merge) diff --git a/src/kernels/main.cpp b/src/kernels/main.cpp new file mode 100644 index 0000000000..cef2b8c15d --- /dev/null +++ b/src/kernels/main.cpp @@ -0,0 +1,152 @@ +#include + +struct Epilogue +{ + __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()(ck::half_t& e, + const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); + } + + float alpha_; + float beta_; +}; + +using CDEElementOp = Epilogue; +using DeviceConv = + ck::tensor_operation::device::CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< + 2, + ck::tensor_layout::convolution::NHWGC, + ck::tensor_layout::convolution::GKYXC, + ck::Tuple<>, + ck::tensor_layout::convolution::NHWGK, + ck::half_t, + ck::half_t, + float, + ck::half_t, + ck::Tuple<>, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + CDEElementOp, + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default, + ck::tensor_operation::device::GemmSpecialization::MNKPadding, + 1, + 64, + 64, + 32, + 32, + 8, + 8, + 32, + 32, + 2, + 1, + ck::Sequence<4, 16, 1>, + ck::Sequence<1, 0, 2>, + ck::Sequence<1, 0, 2>, + 2, + 8, + 8, + 1, + ck::Sequence<4, 16, 1>, + ck::Sequence<1, 0, 2>, + ck::Sequence<1, 0, 2>, + 2, + 8, + 8, + 1, + 1, + 1, + ck::Sequence<1, 16, 1, 4>, + 1>; + +constexpr ck::index_t NumATensor = + ck::tensor_operation::device::GetNumABTensors(); +constexpr ck::index_t NumBTensor = + ck::tensor_operation::device::GetNumABTensors(); + +extern "C" __global__ void +run_64_64_32_32_8_8_32_32_2_1(const ck::half_t* in_dev, + const ck::half_t* wei_dev, + ck::half_t* __restrict__ out_dev, + ck::Array in_lengths, + ck::Array in_strides, + ck::Array wei_lengths, + ck::Array wei_strides, + ck::Array out_lengths, + ck::Array out_strides, + ck::Array conv_filter_strides, + ck::Array conv_filter_dilations, + ck::Array input_left_pads, + ck::Array input_right_pads, + const ck::tensor_operation::element_wise::PassThrough a_element_op, + const ck::tensor_operation::element_wise::PassThrough b_element_op, + const CDEElementOp cde_element_op) +{ + + auto arg = DeviceConv::Argument(in_dev, + wei_dev, + ck::Array{}, + out_dev, + in_lengths, + in_strides, + wei_lengths, + wei_strides, + ck::Array, 0>{}, + ck::Array, 0>{}, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + CDEElementOp{1.0f, 1.0f}); + + constexpr ck::LoopScheduler LoopSched = ck::make_default_loop_scheduler(); + + // GridwiseGemm + using GridwiseGemm = DeviceConv::GridwiseGemm; + + static constexpr auto I0 = ck::Number<0>{}; + + ck::tensor_operation::device::device_grouped_conv_fwd_multiple_abd_xdl_cshuffle< + GridwiseGemm, + const ck::half_t*, + const ck::half_t*, + typename GridwiseGemm::DsGridPointer, + ck::half_t, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + CDEElementOp, + DeviceConv::AGridDesc_AK0_M_AK1, + DeviceConv::BGridDesc_BK0_N_BK1, + DeviceConv::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceConv::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceConv::Block2ETileMap, + ck::tensor_operation::device::ComputePtrOffsetOfStridedBatch, + ck::integral_constant{}, + false, + false>(arg.p_as_grid_.At(I0), + arg.p_bs_grid_.At(I0), + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_g_n_c_wis_lengths_[0], // Group count + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_etile_map_, + arg.compute_ptr_offset_of_batch_); +}; From 9bf88a092f967ba98f2a002b06f55eef3dcbecac Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 13 Sep 2024 21:33:45 +0000 Subject: [PATCH 17/69] adding compiler options --- ...hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 0a11ec0b61..6a1b29b96a 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL #include #include @@ -271,6 +272,13 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( // TODO: MIOpen has it's own handlers for compilation // auto k = rtc::compile_kernel(srcs, options); + /**auto pImpl = std::make_shared(); + pImpl->program = program_name; + pImpl->target = this->GetTargetProperties(); + auto p = HIPOCProgram{}; + p.impl = pImpl; + pImpl->BuildCodeObject(params, src);**/ + // Grid size calculation auto block_size = solution[0].GetTemplateParameter("BlockSize"); @@ -281,10 +289,12 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( kernel.l_wk = {block_size, 1, 1}; kernel.g_wk = {block_size * grid_size, 1, 1}; - /**bool bfp16parm = true; - const auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{});**/ + bool bfp16parm = true; + const auto build_params = + KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; + kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel.comp_options += " -DCK_DONT_USE_HIP_RUNTIME_HEADERS"; + std::cout << "comp options: " << kernel.comp_options << std::endl; soln.construction_params.push_back(kernel); From 83d5d683e1b5d88072cd8df68b041a02a2cf9905 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 13 Sep 2024 21:34:13 +0000 Subject: [PATCH 18/69] updating build to include header guards due to hiprtc compilation issues --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bf0c967e00..bed2524340 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@652a7c046381526947f507a89299aa92d89dbd02 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@e50559862be27949cb49027a50958cc45d8af2f5 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From b96442b79c85c63210561639e5d8b8357f643fd9 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 13 Sep 2024 23:14:02 +0000 Subject: [PATCH 19/69] updating requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bed2524340..16b4460078 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@e50559862be27949cb49027a50958cc45d8af2f5 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@a685822e361045f3ef02a2f60c1c0eadd9cc4c85 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 972c64beaed2f9255e13260b95335329defe97be Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 13 Sep 2024 23:19:37 +0000 Subject: [PATCH 20/69] changed CK hash --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 16b4460078..bed2524340 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@a685822e361045f3ef02a2f60c1c0eadd9cc4c85 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@e50559862be27949cb49027a50958cc45d8af2f5 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 68a75c67db85ce2f598b0e674d200b3706667697 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 16 Sep 2024 08:26:04 +0000 Subject: [PATCH 21/69] update compiler args and another update for requirements.txt --- requirements.txt | 2 +- src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bed2524340..72a0739ef8 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@e50559862be27949cb49027a50958cc45d8af2f5 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@24e8062ab388839b6d6bbae7d525839b691e7682 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index 6a1b29b96a..d16b4d38e3 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -294,6 +294,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); kernel.comp_options += " -DCK_DONT_USE_HIP_RUNTIME_HEADERS"; + kernel.comp_options += " -DCK_CODE_GEN_RTC"; std::cout << "comp options: " << kernel.comp_options << std::endl; soln.construction_params.push_back(kernel); From ce43539c68b1b2f5243a6a5f7fff91021b082d2e Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 19 Sep 2024 08:12:08 +0000 Subject: [PATCH 22/69] changed some files in CK --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 72a0739ef8..e6b3ef4d47 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@24e8062ab388839b6d6bbae7d525839b691e7682 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@e742d30ae0244b18b434caf278da27e2e5930de9 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From d4998f4dfc38b566983b5b45c25cbe15d5b433f9 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Tue, 24 Sep 2024 23:16:41 +0000 Subject: [PATCH 23/69] updated utility files in CK for standard header inclusion issue with hiprtc --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e6b3ef4d47..dc11306091 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@e742d30ae0244b18b434caf278da27e2e5930de9 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@24961297a6ba6231936450994320836e6848b4d1 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 9f3e84570b03e62d1b31210c7f2b96297454a8ab Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 25 Sep 2024 22:56:50 +0000 Subject: [PATCH 24/69] some more header guards --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index dc11306091..d835a6138a 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@24961297a6ba6231936450994320836e6848b4d1 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@afff4356c87066dae1e350bd1e81379ed6b153b2 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 98d36a89ca7ffbce77a0c5b8ef96c1f57d59bef1 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 26 Sep 2024 08:17:37 +0000 Subject: [PATCH 25/69] resolved some conflicts in CK utility files --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d835a6138a..174e91b222 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@afff4356c87066dae1e350bd1e81379ed6b153b2 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@eed724f3d872e3b9fd20d3aa6dfe18106d80c346 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 5c671697929f13b7552ec00582b671b52b758233 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 27 Sep 2024 08:08:39 +0000 Subject: [PATCH 26/69] resolved error in data_type file in CK --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 174e91b222..f5578c52aa 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@eed724f3d872e3b9fd20d3aa6dfe18106d80c346 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@3624dc2a7be1159ca61c3ef22e5804d8b752a287 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From f86c6248b1fd9d4973f867b77d8a9fce5ef16b03 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 30 Sep 2024 08:45:29 +0000 Subject: [PATCH 27/69] resolved errors in a few CK utility files --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f5578c52aa..3ae2fb5339 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@3624dc2a7be1159ca61c3ef22e5804d8b752a287 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@7d9969ab13adfa9555afe3db03a553622320b042 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 2bf2c4c2a75a7101a59e1b00fbb49bad9654e7df Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Tue, 1 Oct 2024 07:52:42 +0000 Subject: [PATCH 28/69] added header guards/replicated functionality in CK device files --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ae2fb5339..5bc214c702 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@7d9969ab13adfa9555afe3db03a553622320b042 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@54f914c628af9780ec200b015fefad581d649ba4 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 35f0c3997e37f4c2a30db24e1efc968f17f5c540 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 2 Oct 2024 08:54:54 +0000 Subject: [PATCH 29/69] resolved standard header errors in gridwise gemm files and device_grouped_conv_fwd_multiple_abd files from CK --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5bc214c702..98ced0f0b2 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@54f914c628af9780ec200b015fefad581d649ba4 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@5de8ecfe62714821029e3c2cae51258a5c2e5ae0 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 4263020eda6ee29d20768fb01f782b2b405a181f Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 2 Oct 2024 20:45:01 +0000 Subject: [PATCH 30/69] resolved error with CK's numerics file --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 98ced0f0b2..e786010ad3 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@5de8ecfe62714821029e3c2cae51258a5c2e5ae0 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@c5cd8bab931ef44c5b1abd05dbb6eecccf20d823 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From b0562694c034f90a720edd0992d7db58e7a96a18 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Tue, 8 Oct 2024 16:30:41 +0000 Subject: [PATCH 31/69] resolved errors in CK's gridwise files --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e786010ad3..cb33a8c7ce 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@c5cd8bab931ef44c5b1abd05dbb6eecccf20d823 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@c0c1a7c18d3430ed2620749f8ecae16c6bd0eac5 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 53eed4adae6bebfaf3091dc038edf32f465a1e84 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 9 Oct 2024 20:00:57 +0000 Subject: [PATCH 32/69] replaces standard header functionlity in threadwise files and header guards in elementwise operations file in CK --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cb33a8c7ce..2a193dea32 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@c0c1a7c18d3430ed2620749f8ecae16c6bd0eac5 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@251ab61207f3ed29b6fa31b8a9594097f8abdfa9 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 30b16f798d875fcfe6a24bc7240d858133975464 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 11 Oct 2024 08:27:18 +0000 Subject: [PATCH 33/69] temp fix for namespace error --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2a193dea32..b60545d572 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@251ab61207f3ed29b6fa31b8a9594097f8abdfa9 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@5e206700cb63bef7d5fca10dd519bc729cb2bcc8 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From 550eaa26fbfa1b28035e666a490170a0e4ddf5cf Mon Sep 17 00:00:00 2001 From: Astha Date: Mon, 14 Oct 2024 17:14:55 -0400 Subject: [PATCH 34/69] replaced standard header usage in elementwise files and codegen device op --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b60545d572..ba628de05d 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@5e206700cb63bef7d5fca10dd519bc729cb2bcc8 -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON +ROCm/composable_kernel@fc7a1825431e05f59e16fb80ddc5ced793b2841c -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON google/googletest@v1.14.0 From b81029ec2129abaec8fdab714faf4714efbf50fa Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Tue, 15 Oct 2024 17:38:43 +0000 Subject: [PATCH 35/69] added compiler argument to build codegen --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 26032e5701..9dcade15a5 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@fc7a1825431e05f59e16fb80ddc5ced793b2841c -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@fc7a1825431e05f59e16fb80ddc5ced793b2841c -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON google/googletest@v1.14.0 From a01c0b3d7beacf99b9d9054f6c7944957d64c811 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Tue, 15 Oct 2024 17:39:38 +0000 Subject: [PATCH 36/69] merged my CK codegen branch with develop --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9dcade15a5..68e591007d 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@fc7a1825431e05f59e16fb80ddc5ced793b2841c -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON +ROCm/composable_kernel@6fcaeada9053dee028f51e70ed8204d178a3c935 -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON google/googletest@v1.14.0 From 18294af643999d479339208359970285be1b88ba Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 16 Oct 2024 07:56:45 +0000 Subject: [PATCH 37/69] updating codegen compiler argument to ON --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 68e591007d..ac81129f57 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@6fcaeada9053dee028f51e70ed8204d178a3c935 -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON +ROCm/composable_kernel@59ac05dfecb17a25e2b255a850b242cebb208729 -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON google/googletest@v1.14.0 From ee99eb195991c34a45e506c2fa069b8c83ddaed6 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 16 Oct 2024 19:01:21 +0000 Subject: [PATCH 38/69] temporarily removed command line compiler argument for codegen --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ac81129f57..8975b4b75a 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@59ac05dfecb17a25e2b255a850b242cebb208729 -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON +ROCm/composable_kernel@59ac05dfecb17a25e2b255a850b242cebb208729 -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From b062ec26adbc09114ac8a1b88b6c2891f485b885 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 16 Oct 2024 23:21:32 +0000 Subject: [PATCH 39/69] temporarily removing codegen compiler flag in CK for testing --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8975b4b75a..eab6cfbd68 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@59ac05dfecb17a25e2b255a850b242cebb208729 -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@825d9008a68a576749a455c4953ddac7b29542aa -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From c4a1ea69139ba767802bb3d6e015ebe00106afe8 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 17 Oct 2024 15:23:53 +0000 Subject: [PATCH 40/69] changed CK commit hash in Dockerfile --- Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9d3ed76712..10bd17b5af 100755 --- a/Dockerfile +++ b/Dockerfile @@ -114,7 +114,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ miopen-hip # TODO: it should be able to automatically get commit hash from requirements.txt -ARG CK_COMMIT=467b4e502d1c2ee2c5fe85ff9fd637b04a5b7ba7 +ARG CK_COMMIT=825d9008a68a576749a455c4953ddac7b29542aa RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ @@ -150,4 +150,4 @@ RUN pip3 install --upgrade cmake==3.27.5 # groupadd can add one group a time RUN groupadd -f render RUN groupadd -f video -RUN usermod -a -G render,video root \ No newline at end of file +RUN usermod -a -G render,video root From 9d9483c208d835ddb922fb96d3d6387ca4b4c21d Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 18 Oct 2024 08:01:07 +0000 Subject: [PATCH 41/69] resolved error in solver files from merge with develop --- src/include/miopen/conv/solvers.hpp | 20 + src/include/miopen/solver.hpp | 5089 ----------------- ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp | 2 +- 3 files changed, 21 insertions(+), 5090 deletions(-) diff --git a/src/include/miopen/conv/solvers.hpp b/src/include/miopen/conv/solvers.hpp index 699f267b99..a338dfa376 100644 --- a/src/include/miopen/conv/solvers.hpp +++ b/src/include/miopen/conv/solvers.hpp @@ -4482,6 +4482,26 @@ struct ConvHipImplicitGemmGroupFwdXdlops final bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; }; +struct ConvHipImplicitGemmGroupFwdXdlopsCodegen final : ConvSolver +{ + // TODO: update this fcn + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( + const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; + + bool MayNeedWorkspace() const override { return true; } + + MIOPEN_INTERNALS_EXPORT bool + IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; + + MIOPEN_INTERNALS_EXPORT ConvSolution + GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; +}; + struct PerformanceConfigHipImplicitGemm3DGroupFwdXdlops : PerfConfigBaseCK { diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 5c9aad1593..69f47f6ed6 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -265,5095 +265,6 @@ struct IsTunable : std::is_base_of "Raw trait shouldn't be passed, explicit type is needed"); }; -namespace conv { - -/// Typedef for convolution non-tunable solvers -using ConvSolver = NonTunableSolverBase; - -/// Typedef for convolution tunable solvers -template -using ConvTunableSolver = - TunableSolverMixin; - -struct PerformanceConfigConvAsm3x3U : PerfConfigBase -{ - int limit_wave_cnt; // [0..9] - int filters_per_wave; // [1..8] - int output_lines_per_wave; // [1..8] - - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm3x3U(int lwc, int fpw, int olpw); - PerformanceConfigConvAsm3x3U() : PerformanceConfigConvAsm3x3U(-1, -1, -1) {} - PerformanceConfigConvAsm3x3U(bool) : PerformanceConfigConvAsm3x3U(0, 1, 1) {} - - template - static void Visit(Self&& self, F f) - { - f(self.limit_wave_cnt, "limit_wave_cnt"); - f(self.filters_per_wave, "filters_per_wave"); - f(self.output_lines_per_wave, "output_lines_per_wave"); - } - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigConvAsm3x3U& other) const; -}; - -struct ConvAsm3x3U final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm3x3U GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsm3x3U&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm3x3U - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsm3x3U&) const override; -}; - -struct PerformanceConfigConvAsm1x1U : PerfConfigBase -{ - // ----------------- // Full set Optimized Spare - // ---------------------------------------------------------------------------- - int read_size; // [1..4] - int k_mult; // 1,[4,8,12..32] 2^n[8..32] 1,4 - int chunks_per_wave; // [1..16] [1..8] - int chunk_size; // 2^n[1..64] 2^n[16..64] 1,4 - int n_mult; // [1..8] [1..4] - int c_mult; // 2^n[1..32] 2^n[1..4] - int waves_c_in_group; // [1..8] [1..4] - int waves_k_in_group; // 1,[2,4,8] 1,[2,4,8] - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT - PerformanceConfigConvAsm1x1U(int, int, int, int, int, int, int, int, bool); - PerformanceConfigConvAsm1x1U() - : PerformanceConfigConvAsm1x1U(-1, -1, -1, -1, -1, -1, -1, -1, false) - { - } - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm1x1U(bool spare); - - template - static void Visit(Self&& self, F f) - { - f(self.read_size, "read_size"); - f(self.k_mult, "k_mult"); - f(self.chunks_per_wave, "chunks_per_wave"); - f(self.chunk_size, "chunk_size"); - f(self.n_mult, "n_mult"); - f(self.c_mult, "c_mult"); - f(self.waves_c_in_group, "waves_c_in_group"); - f(self.waves_k_in_group, "waves_k_in_group"); - } - - // clang-format off - int GetReadSize() const { return read_size; } - int GetKMult() const { return k_mult; } - int GetChunksPerWave() const { return chunks_per_wave; } - int GetChunkSize() const { return chunk_size; } - int GetNMult() const { return n_mult; } - int GetCMult() const { return c_mult; } - int GetWavesCInGroup() const { return waves_c_in_group; } - int GetWavesKInGroup() const { return waves_k_in_group; } - int GetNPerGpr() const { assert(chunk_size); return 64 / chunk_size; } - // clang-format on - - MIOPEN_INTERNALS_EXPORT void StaticHeuristic(const miopen::conv::ProblemDescription& problem); - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool - IsModelApplicable(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const; - bool IsValidValue() const { return IsValidValueImpl(8); } - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - bool IsValid(const miopen::conv::ProblemDescription& problem) const - { - return IsValidImpl(problem, 8); - } - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigConvAsm1x1U& other) const; - -private: -#if MIOPEN_ENABLE_AI_KERNEL_TUNING - bool IsPartiallyValid(const miopen::conv::ProblemDescription& problem, - int sequence_length) const - { - return IsValidImpl(problem, sequence_length); - } - bool IsPartiallyValidValue(int sequence_length) const - { - return IsValidValueImpl(sequence_length); - } - bool RunParameterPredictionModel(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - bool ModelApplyToken(int index, std::string value, const miopen::conv::ProblemDescription&); -#endif - bool IsValidImpl(const miopen::conv::ProblemDescription& problem, int sequence_length) const; - bool IsValidValueImpl(int sequence_length) const; -}; - -struct ConvAsm1x1U final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm1x1U GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsm1x1U&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm1x1U - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsm1x1U&) const override; -}; - -struct PerformanceConfigConvAsm1x1UV2 : PerfConfigBase -{ - // ----------------- // Full set Optimized Spare - // ---------------------------------------------------------------------------- - int chunk_size; // 2^n[1..64] 2^n[16..64] - int dwords_per_ld; // [1..4] 1,2,3 - int k_mult; // [1..32] 8,16 1,2,3,4 - int c_mult; // [1..32] 2^n[1..4] - int n_mult; // [1..32] 1,2 - int w_mult; // [1..32] 1,2 - int h_mult; // [1..32] 1,2 - int h_per_chunk; // 2^n[1..64] [2,4,8] - int waves_k_in_group; // [1..8] 2,4 - int waves_c_in_group; // [1..8] 1,2 - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT - PerformanceConfigConvAsm1x1UV2(int, int, int, int, int, int, int, int, int, int, bool); - PerformanceConfigConvAsm1x1UV2() - : PerformanceConfigConvAsm1x1UV2(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, false) - { - } - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm1x1UV2(bool spare); - - template - static void Visit(Self&& self, F f) - { - f(self.chunk_size, "chunk_size"); - f(self.dwords_per_ld, "dwords_per_ld"); - f(self.k_mult, "k_mult"); - f(self.c_mult, "c_mult"); - f(self.n_mult, "n_mult"); - f(self.w_mult, "w_mult"); - f(self.h_mult, "h_mult"); - f(self.h_per_chunk, "h_per_chunk"); - f(self.waves_k_in_group, "waves_k_in_group"); - f(self.waves_c_in_group, "waves_c_in_group"); - } - - // clang-format off - int GetChunkSize() const { return chunk_size; } - int GetDwordsPerLd() const { return dwords_per_ld; } - int GetCMult() const { return c_mult; } - int GetKMult() const { return k_mult; } - int GetNMult() const { return n_mult; } - int GetWMult() const { return w_mult; } - int GetHMult() const { return h_mult; } - int GetHPerChunk() const { return h_per_chunk; } - int GetWavesCInGroup() const { return waves_c_in_group; } - int GetWavesKInGroup() const { return waves_k_in_group; } - int GetNPerGpr() const { assert(chunk_size); return 64 / chunk_size; } - // clang-format on - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigConvAsm1x1UV2& other) const; -}; - -struct ConvAsm1x1UV2 final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm1x1UV2 GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsm1x1UV2&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsm1x1UV2 - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsm1x1UV2&) const override; -}; - -struct ConvAsm5x10u2v2f1 final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsm5x10u2v2b1 final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsm7x7c3h224w224k64u2v2p3q3f1 final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvOclDirectFwd11x11 final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvOclDirectFwdGen final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct PerformanceImplicitGemm : PerfConfigBase -{ - int BPerBlock; // 2^n[8..16] - int KPerBlock; // 2^n[32..128] - int EPerBlock; // 2^n[4..16] - - int GemmNRepeat; // == 2 - - int GemmMPerThreadSubC; // 2^n[2..4] - int GemmNPerThreadSubC; // 2^n[2..4] - - int GemmMLevel0Cluster; // 2^n[1..4] - int GemmNLevel0Cluster; // 2^n[1..4] - int GemmMLevel1Cluster; // 2^n[1..4] - int GemmNLevel1Cluster; // 2^n[1..4] - - int InBlockCopyClusterLengths_E; // 2^n[4..16] - int InBlockCopyClusterLengths_B; // 2^n[8..16] - int InBlockCopyClusterLengths_N1; // 2^n[1..2] - int InBlockCopyClusterLengths_N2; // 2^n[1..4] - - int WeiBlockCopyClusterLengths_E; // 2^n[1..4] - int WeiBlockCopyClusterLengths_K; // 2^n[16..128] - - bool use_spare_set; - - PerformanceImplicitGemm( - int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, bool); - - PerformanceImplicitGemm() - : PerformanceImplicitGemm( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, false) - { - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemm(bool spare); - - template - static void Visit(Self&& self, F f) - { - f(self.BPerBlock, "BPerBlock"); - f(self.KPerBlock, "KPerBlock"); - f(self.EPerBlock, "EPerBlock"); - f(self.GemmNRepeat, "GemmNRepeat"); - f(self.GemmMPerThreadSubC, "GemmMPerThreadSubC"); - f(self.GemmNPerThreadSubC, "GemmNPerThreadSubC"); - f(self.GemmMLevel0Cluster, "GemmMLevel0Cluster"); - f(self.GemmNLevel0Cluster, "GemmNLevel0Cluster"); - f(self.GemmMLevel1Cluster, "GemmMLevel1Cluster"); - f(self.GemmNLevel1Cluster, "GemmNLevel1Cluster"); - f(self.InBlockCopyClusterLengths_E, "InBlockCopyClusterLengths_E"); - f(self.InBlockCopyClusterLengths_N1, "InBlockCopyClusterLengths_N1"); - f(self.InBlockCopyClusterLengths_B, "InBlockCopyClusterLengths_B"); - f(self.InBlockCopyClusterLengths_N2, "InBlockCopyClusterLengths_N2"); - f(self.WeiBlockCopyClusterLengths_E, "WeiBlockCopyClusterLengths_E"); - f(self.WeiBlockCopyClusterLengths_K, "WeiBlockCopyClusterLengths_K"); - } - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceImplicitGemm& other) const; -}; - -struct PerformanceImplicitGemmV4R1 : public PerformanceImplicitGemm -{ - PerformanceImplicitGemmV4R1(int a, - int b, - int c, - int d, - int e, - int f, - int g, - int h, - int i, - int j, - int k, - int l, - int m, - int n, - int o, - int p, - bool q) - : PerformanceImplicitGemm(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q) - { - } - - PerformanceImplicitGemmV4R1() - : PerformanceImplicitGemmV4R1( - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, false) - { - } - - PerformanceImplicitGemmV4R1(bool spare) : PerformanceImplicitGemm(spare) {} - - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceImplicitGemmV4R4Fwd : PerfConfigBase -{ - int BlockSize; - - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - - int GemmMPerThread; - int GemmNPerThread; - - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4Fwd(int, int, int, int, int, int, bool); - - PerformanceImplicitGemmV4R4Fwd(int a, int b, int c, int d, int e, int f) - : PerformanceImplicitGemmV4R4Fwd(a, b, c, d, e, f, false) - { - } - - PerformanceImplicitGemmV4R4Fwd() : PerformanceImplicitGemmV4R4Fwd(-1, -1, -1, -1, -1, -1, false) - { - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4Fwd(bool spare); - - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceImplicitGemmV4R4Fwd& other) const; - - template - static void Visit(Self&& self, F f) - { - f(self.BlockSize, "BlockSize"); - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerThread, "GemmMPerThread"); - f(self.GemmNPerThread, "GemmNPerThread"); - } - - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateBlockGemmPerformanceParameters() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmCThreadCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); -}; - -struct PerformanceImplicitGemmV4R4WrW : PerfConfigBase -{ - int BlockSize; - - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - - int GemmMPerThread; - int GemmNPerThread; - - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4WrW(int, int, int, int, int, int, bool); - - PerformanceImplicitGemmV4R4WrW(int a, int b, int c, int d, int e, int f) - : PerformanceImplicitGemmV4R4WrW(a, b, c, d, e, f, false) - { - } - - PerformanceImplicitGemmV4R4WrW() : PerformanceImplicitGemmV4R4WrW(-1, -1, -1, -1, -1, -1, false) - { - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4WrW(bool spare); - - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceImplicitGemmV4R4WrW& other) const; - - template - static void Visit(Self&& self, F f) - { - f(self.BlockSize, "BlockSize"); - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerThread, "GemmMPerThread"); - f(self.GemmNPerThread, "GemmNPerThread"); - } - - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateBlockGemmPerformanceParameters() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmCThreadCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); -}; - -struct PerformanceImplicitGemmBwdDataV1R1 : PerfConfigBase -{ - int BlockSize; - - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - - int GemmMPerThread; - int GemmNPerThread; - - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV1R1(int, int, int, int, int, int, bool); - - PerformanceImplicitGemmBwdDataV1R1() - : PerformanceImplicitGemmBwdDataV1R1(-1, -1, -1, -1, -1, -1, false) - { - } - - PerformanceImplicitGemmBwdDataV1R1(int a, int b, int c, int d, int e, int f) - : PerformanceImplicitGemmBwdDataV1R1(a, b, c, d, e, f, false) - { - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV1R1(bool spare); - - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceImplicitGemmBwdDataV1R1& other) const; - - template - static void Visit(Self&& self, F f) - { - f(self.BlockSize, "BlockSize"); - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerThread, "GemmMPerThread"); - f(self.GemmNPerThread, "GemmNPerThread"); - } - - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateBlockGemmPerformanceParameters() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmCThreadCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); -}; - -struct PerformanceImplicitGemmBwdDataV4R1 : PerfConfigBase -{ - int BlockSize; - - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - - int GemmMPerThread; - int GemmNPerThread; - - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1(int, int, int, int, int, int, bool); - - PerformanceImplicitGemmBwdDataV4R1() - : PerformanceImplicitGemmBwdDataV4R1(-1, -1, -1, -1, -1, -1, false) - { - } - - PerformanceImplicitGemmBwdDataV4R1(int a, int b, int c, int d, int e, int f) - : PerformanceImplicitGemmBwdDataV4R1(a, b, c, d, e, f, false) - { - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1(bool spare); - - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceImplicitGemmBwdDataV4R1& other) const; - - template - static void Visit(Self&& self, F f) - { - f(self.BlockSize, "BlockSize"); - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerThread, "GemmMPerThread"); - f(self.GemmNPerThread, "GemmNPerThread"); - } - - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateBlockGemmPerformanceParameters() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmCThreadCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple MIOPEN_INTERNALS_EXPORT - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); -}; - -struct PerformanceImplicitGemmBwdDataV4R1Xdlops - : PerfConfigBase -{ - int GemmNPerBlock; // 2^n[8..16] - int GemmMPerBlock; // 2^n[32..128] - int GemmKPerBlock; // 2^n[4..16] - - int GemmKPACKSize; // 2^[1..4] - - int GemmMPerWave; - int GemmNPerWave; - - // GemmAThreadCopyMoreGemmK is currently a fix value, is untunable - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmKPack; - - bool use_spare_set; - MIOPEN_INTERNALS_EXPORT - PerformanceImplicitGemmBwdDataV4R1Xdlops(int, int, int, int, int, int, bool, bool, bool); - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1Xdlops(); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1Xdlops(bool spare); - PerformanceImplicitGemmBwdDataV4R1Xdlops( - int a, int b, int c, int d, int e, int f, bool g, bool h) - : PerformanceImplicitGemmBwdDataV4R1Xdlops(a, b, c, d, e, f, g, h, false) - { - } - - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceImplicitGemmBwdDataV4R1Xdlops& other) const; - - template - static void Visit(Self&& self, F f) - { - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmKPACKSize, "GemmKPACKSize"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack"); - } - - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsReallyValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - IsFastToBeUsedForTuning(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); -}; - -struct ConvHipImplicitGemmV4R1Fwd final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R1 GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R1&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R1&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R1 - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; -}; - -struct ConvHipImplicitGemmV4R4Fwd final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4Fwd GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R4Fwd&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4Fwd - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R4Fwd&) const override; - -private: - static std::tuple CalculateGemmSize(const miopen::conv::ProblemDescription&); - - friend struct PerformanceImplicitGemmV4R4Fwd; -}; - -struct PerformanceConvMlirIgemm : PerfConfigBase -{ - int BlockSize; - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - int GemmMPerThread; - int GemmNPerThread; - bool use_spare_set; - - /// \ref https://github.com/ROCm/MIOpen/issues/1154 - static PerformanceConvMlirIgemm& MlirHeuristicInitRequest() - { - static PerformanceConvMlirIgemm heur; - heur.SetMlirHeuristicInitRequest(); - return heur; - } - - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm(int, int, int, int, int, int, bool); - - PerformanceConvMlirIgemm(int a, int b, int c, int d, int e, int f) - : PerformanceConvMlirIgemm(a, b, c, d, e, f, false) - { - } - - PerformanceConvMlirIgemm() : PerformanceConvMlirIgemm(-1, -1, -1, -1, -1, -1, false) {} - - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm(bool spare); - - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConvMlirIgemm& other) const; - - template - static void Visit(Self&& self, F f) - { - f(self.BlockSize, "BlockSize"); - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerThread, "GemmMPerThread"); - f(self.GemmNPerThread, "GemmNPerThread"); - } - - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - -private: - void SetMlirHeuristicInitRequest(); -}; - -struct ConvMlirIgemmFwd final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemm&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemm&) const override; -}; - -struct PerformanceConvMlirIgemmXdlops : PerfConfigBase -{ - int GemmMPerBlock; // 2^n[32..128] - int GemmNPerBlock; // 2^n[8..16] - int GemmKPerBlock; // 2^n[4..16] - int GemmMPerWave; - int GemmNPerWave; - int GemmKPACKSize; // 2^[1..4] - - // GemmAThreadCopyMoreGemmK is currently a fix value, is untunable - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmKPack; - - bool use_spare_set; - - /// \ref https://github.com/ROCm/MIOpen/issues/1154 - static PerformanceConvMlirIgemmXdlops& MlirHeuristicInitRequest() - { - static PerformanceConvMlirIgemmXdlops heur; - heur.SetMlirHeuristicInitRequest(); - return heur; - } - - MIOPEN_INTERNALS_EXPORT - PerformanceConvMlirIgemmXdlops(int, int, int, int, int, int, bool, bool, bool); - - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops(); - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops(bool spare); - PerformanceConvMlirIgemmXdlops(int a, int b, int c, int d, int e, int f, bool g, bool h) - : PerformanceConvMlirIgemmXdlops(a, b, c, d, e, f, g, h, false) - { - } - - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConvMlirIgemmXdlops& other) const; - - template - static void Visit(Self&& self, F f) - { - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmKPACKSize, "GemmKPACKSize"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack"); - } - - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - -private: - void SetMlirHeuristicInitRequest(); -}; - -struct ConvMlirIgemmFwdXdlops final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemmXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemmXdlops&) const override; -}; - -struct ConvHipImplicitGemmV4R4WrW final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4WrW GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R4WrW&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R4WrW - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R4WrW&) const override; - -private: - static std::tuple CalculateGemmSize(const miopen::conv::ProblemDescription&); - - friend struct PerformanceImplicitGemmV4R4WrW; -}; - -struct ConvMlirIgemmWrW final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemm&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemm&) const override; -}; - -struct ConvMlirIgemmWrWXdlops final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemmXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemmXdlops&) const override; -}; - -struct PerformanceImplicitGemmForwardV4R4Xdlops - : PerfConfigBase -{ - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - int GemmMPerWave; - int GemmNPerWave; - int GemmKPack; - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmKPack; - int GemmBThreadDataPerRead_GemmN; - - MIOPEN_INTERNALS_EXPORT - PerformanceImplicitGemmForwardV4R4Xdlops(int, int, int, int, int, int, bool, bool, int); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops(); - PerformanceImplicitGemmForwardV4R4Xdlops(bool) : PerformanceImplicitGemmForwardV4R4Xdlops() {} - - template - static void Visit(Self&& self, F f) - { - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmKPack, "GemmKPack"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack"); - f(self.GemmBThreadDataPerRead_GemmN, "GemmBThreadDataPerRead_GemmN"); - } - - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceImplicitGemmForwardV4R4Xdlops& other) const; - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsReallyValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - IsFastToBeUsedForTuning(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - - MIOPEN_INTERNALS_EXPORT std::tuple CalculateBlockSize() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceImplicitGemmForwardV4R5Xdlops - : PerfConfigBase -{ - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - int GemmMPerWave; - int GemmNPerWave; - int GemmKPack; - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmKPack; - int GemmBThreadDataPerRead_GemmN; - - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT - PerformanceImplicitGemmForwardV4R5Xdlops(int, int, int, int, int, int, bool, bool, int, bool); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R5Xdlops(); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R5Xdlops(bool spare); - - PerformanceImplicitGemmForwardV4R5Xdlops( - int a, int b, int c, int d, int e, int f, bool g, bool h, int i) - : PerformanceImplicitGemmForwardV4R5Xdlops(a, b, c, d, e, f, g, h, i, false) - { - } - - template - static void Visit(Self&& self, F f) - { - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmKPack, "GemmKPack"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack"); - f(self.GemmBThreadDataPerRead_GemmN, "GemmBThreadDataPerRead_GemmN"); - } - - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceImplicitGemmForwardV4R5Xdlops& other) const; - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsReallyValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - IsFastToBeUsedForTuning(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - - MIOPEN_INTERNALS_EXPORT std::tuple CalculateBlockSize() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm - : PerfConfigBase -{ - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - int GemmMPerWave; - int GemmNPerWave; - int GemmKPack; - int GemmMFactor; - int GemmNFactor; - int GemmKFactor; - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmKPack; - int GemmBThreadDataPerRead_GemmN; - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm( - int, int, int, int, int, int, int, int, int, bool, bool, int); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm(); - PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm(bool) - : PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm() - { - } - - template - static void Visit(Self&& self, F f) - { - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmKPack, "GemmKPack"); - f(self.GemmMFactor, "GemmMFactor"); - f(self.GemmNFactor, "GemmNFactor"); - f(self.GemmKFactor, "GemmKFactor"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack"); - f(self.GemmBThreadDataPerRead_GemmN, "GemmBThreadDataPerRead_GemmN"); - } - - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm& other) const; - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsReallyValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - IsFastToBeUsedForTuning(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - - MIOPEN_INTERNALS_EXPORT std::tuple CalculateBlockSize() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceImplicitGemmBwdV1R1Xdlops : PerfConfigBase -{ - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - int GemmMPerWave; - int GemmNPerWave; - int GemmKPack; - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmKPack; - - MIOPEN_INTERNALS_EXPORT - PerformanceImplicitGemmBwdV1R1Xdlops(int, int, int, int, int, int, bool, bool); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdV1R1Xdlops(); - PerformanceImplicitGemmBwdV1R1Xdlops(bool) : PerformanceImplicitGemmBwdV1R1Xdlops() {} - - template - static void Visit(Self&& self, F f) - { - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmKPack, "GemmKPack"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmKPack, "GemmBThreadCopyMoreGemmKPack"); - } - - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceImplicitGemmBwdV1R1Xdlops& other) const; - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsReallyValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - IsFastToBeUsedForTuning(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - - MIOPEN_INTERNALS_EXPORT std::tuple CalculateBlockSize() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmForwardV4R4Xdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - -private: - static std::tuple - CalculateGemmSize(const miopen::conv::ProblemDescription&); - - friend struct PerformanceImplicitGemmForwardV4R4Xdlops; -}; - -struct ConvHipImplicitGemmForwardV4R4Xdlops_Padded_Gemm final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - -private: - static std::tuple CalculateGemmSize( - const miopen::conv::ProblemDescription&, int GemmMFactor, int GemmNFactor, int GemmKFactor); - - friend struct PerformanceImplicitGemmForwardV4R4Xdlops_Padded_Gemm; -}; - -struct ConvHipImplicitGemmForwardV4R5Xdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R5Xdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmForwardV4R5Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmForwardV4R5Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R5Xdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; -}; - -struct ConvHipImplicitGemmV4R1WrW final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R1 GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R1&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmV4R1&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmV4R1 - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; -}; - -struct ConvHipImplicitGemmBwdDataV1R1 final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV1R1 GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdDataV1R1&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV1R1 - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdDataV1R1&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool MayNeedWorkspace() const override { return true; } - -private: - static std::tuple CalculateGemmSize(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - - friend struct PerformanceImplicitGemmBwdDataV1R1; -}; - -struct ConvMlirIgemmBwd final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemm&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemm - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemm&) const override; -}; - -struct ConvMlirIgemmBwdXdlops final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemmXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvMlirIgemmXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvMlirIgemmXdlops&) const override; -}; - -struct ConvHipImplicitGemmBwdDataV4R1 final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1 GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdDataV4R1&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1 - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdDataV4R1&) const override; - -private: - static int CalculateNumberOfGemm(const miopen::conv::ProblemDescription&); - static std::tuple CalculateGemmSize(const miopen::conv::ProblemDescription&, - int gemm_id); - - friend struct PerformanceImplicitGemmBwdDataV4R1; -}; - -struct ConvHipImplicitGemmBwdDataV4R1Xdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1Xdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdDataV4R1Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdDataV4R1Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdDataV4R1Xdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - -private: - static int CalculateNumberOfGemm(const miopen::conv::ProblemDescription&); - static std::tuple CalculateGemmSize(const miopen::conv::ProblemDescription&, - int gemm_id); - - friend struct PerformanceImplicitGemmBwdDataV4R1Xdlops; -}; - -struct ConvHipImplicitGemmBwdDataV1R1Xdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdV1R1Xdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdV1R1Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmBwdV1R1Xdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmBwdV1R1Xdlops&) const override; - -private: - static std::tuple - CalculateGemmSize(const miopen::conv::ProblemDescription&); - - friend struct PerformanceImplicitGemmBwdV1R1Xdlops; -}; - -struct ConvAsmImplicitGemmV4R1DynamicFwd final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsmImplicitGemmV4R1DynamicFwd_1x1 final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsmImplicitGemmV4R1DynamicWrw final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsmImplicitGemmGTCDynamicWrwXdlops final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsmImplicitGemmV4R1DynamicBwd final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsmImplicitGemmGTCDynamicFwdXdlops final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvAsmImplicitGemmGTCDynamicBwdXdlops final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -/// Holds common member functions for the Solvers which share the same -/// "legacy exhaustive search" machinery. -struct ConvOclDirectFwdLegacyExhaustiveSearch : ConvTunableSolver -{ - MIOPEN_INTERNALS_EXPORT LegacyPerformanceConfig GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT LegacyPerformanceConfig - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - -private: - template - LegacyPerformanceConfig SearchImpl(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const; -}; - -struct ConvOclDirectFwd : ConvOclDirectFwdLegacyExhaustiveSearch -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT static ConvSolution - BaseGetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&); - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&) const override; -}; - -struct ConvOclDirectFwd1x1 final : ConvOclDirectFwdLegacyExhaustiveSearch -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&) const override; - - bool IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&) const override - { - return true; - } -}; - -struct ConvBinWinograd3x3U final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvBinWinogradRxS final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct PerformanceConfigConvBinWinogradRxS : PerfConfigBase -{ - int n_groups; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvBinWinogradRxS(int n_groups_); - PerformanceConfigConvBinWinogradRxS() : PerformanceConfigConvBinWinogradRxS(-1) {} - PerformanceConfigConvBinWinogradRxS(bool) : PerformanceConfigConvBinWinogradRxS(1) {} - - template - static void Visit(Self&& self, F f) - { - f(self.n_groups, "n_groups"); - } - int GetNGroups() const { return n_groups; } - - template - void HeuristicInit(const ExecutionContext&, const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - bool IsValid(const ExecutionContext& ctx, const miopen::conv::ProblemDescription&) const - { - return IsValid(ctx); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&) const; - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigConvBinWinogradRxS& other) const; -}; - -template -struct ConvBinWinoRxS final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - static const std::string& GetSolverDbId() - { - static const std::string dbId = std::string("ConvBinWinogradRxSf") - .append(std::to_string(Winodata)) - .append("x") - .append(std::to_string(Winofilter)); - return dbId; - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvBinWinogradRxS GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvBinWinogradRxS&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvBinWinogradRxS - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvBinWinogradRxS&) const override; - -private: - static size_t GetNGroups(const size_t group_conv, const size_t grid_group_size) - { - assert(group_conv != 0); - return grid_group_size / group_conv; - } -}; - -// Suppress misleading clang warnings -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wweak-template-vtables" -#endif - -extern template struct ConvBinWinoRxS<2, 3>; -extern template struct ConvBinWinoRxS<3, 2>; - -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -struct ConvBinWinogradRxSf2x3g1 final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -template -struct ConvMPBidirectWinograd final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId< - ConvMPBidirectWinograd>(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - // kernel_file_name for solver identification - static fs::path GetSolverFileNames(int id) - { - static const fs::path names[3] = {"xform_bidirect_winograd_data.s", - "xform_bidirect_winograd_filter.s", - "xform_bidirect_winograd_out.s"}; - return names[id]; - } - - static std::string GetSolverKernelNames(int id) - { - static const std::string name_suffix = - '_' + std::to_string(WinoDataH) + '_' + std::to_string(WinoDataW) + '_' + - std::to_string(WinoFilterH) + '_' + std::to_string(WinoFilterW); - static const std::string names[3] = { - "miopenGcnAsmMPBidirectWinogradXformData" + name_suffix, - "miopenGcnAsmMPBidirectWinogradXformFilter" + name_suffix, - "miopenGcnAsmMPBidirectWinogradXformOut" + name_suffix}; - return names[id]; - } - - static int GetSolverWinoXformHWSize() { return WinoDataH + WinoFilterH - 1; } -}; - -// To suppress misleading clang warnings -#if defined(__clang__) && defined(CONV_MP_BIDIRECTIONAL_WINOGRAD_CPP) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wweak-template-vtables" -#endif - -extern template struct ConvMPBidirectWinograd<2, 3>; -extern template struct ConvMPBidirectWinograd<3, 3>; -extern template struct ConvMPBidirectWinograd<4, 3>; -extern template struct ConvMPBidirectWinograd<5, 3>; -extern template struct ConvMPBidirectWinograd<6, 3>; - -#if defined(__clang__) && defined(CONV_MP_BIDIRECTIONAL_WINOGRAD_CPP) -#pragma clang diagnostic pop -#endif - -template -struct ConvMPBidirectWinograd_xdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId< - ConvMPBidirectWinograd_xdlops>(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override - { - return ConvHipImplicitGemmForwardV4R4Xdlops{}.IsDynamic() && - ConvMPBidirectWinograd{} - .IsDynamic() && - IsThisSolverDynamic(); - } - - PerformanceImplicitGemmForwardV4R4Xdlops - GetDefaultPerformanceConfig(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const override - { - const auto xdlops_problem = GetTransformedProblem(problem); - const auto xdlops_ctx = GetTransformedConvContext(ctx, xdlops_problem); - - return ConvHipImplicitGemmForwardV4R4Xdlops{}.GetDefaultPerformanceConfig(xdlops_ctx, - xdlops_problem); - } - - bool - IsValidPerformanceConfig(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem, - const PerformanceImplicitGemmForwardV4R4Xdlops& config) const override - { - const auto xdlops_problem = GetTransformedProblem(problem); - const auto xdlops_ctx = GetTransformedConvContext(ctx, xdlops_problem); - - return ConvHipImplicitGemmForwardV4R4Xdlops{}.IsValidPerformanceConfig( - xdlops_ctx, xdlops_problem, config); - } - - size_t GetWorkspaceSize(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const override - { - const auto xdlops_problem = GetTransformedProblem(problem); - const auto xdlops_ctx = GetTransformedConvContext(ctx, xdlops_problem); - - return ConvMPBidirectWinograd() - .GetWorkspaceSize(ctx, problem) + - ConvHipImplicitGemmForwardV4R4Xdlops{}.GetWorkspaceSize(xdlops_ctx, xdlops_problem); - } - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmForwardV4R4Xdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmForwardV4R4Xdlops&) const override; - -private: - ExecutionContext - GetTransformedConvContext(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& transformed_problem) const; - miopen::conv::ProblemDescription - GetTransformedProblem(const miopen::conv::ProblemDescription& problem) const; - - // kernel_file_name for solver identification - static fs::path GetSolverFileNames(int id) - { - return ConvMPBidirectWinograd:: - GetSolverFileNames(id); - } - - static std::string GetSolverKernelNames(int id) - { - return ConvMPBidirectWinograd:: - GetSolverKernelNames(id); - } - - static int GetSolverWinoXformHWSize() - { - return ConvMPBidirectWinograd:: - GetSolverWinoXformHWSize(); - } - - bool IsThisSolverDynamic() const { return true; } -}; - -// To suppress misleading clang warnings -#if defined(__clang__) && defined(CONV_MP_BIDIRECTIONAL_WINOGRAD_CPP) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wweak-template-vtables" -#endif - -extern template struct ConvMPBidirectWinograd_xdlops<2, 3>; -extern template struct ConvMPBidirectWinograd_xdlops<3, 3>; -extern template struct ConvMPBidirectWinograd_xdlops<4, 3>; -extern template struct ConvMPBidirectWinograd_xdlops<5, 3>; -extern template struct ConvMPBidirectWinograd_xdlops<6, 3>; - -#if defined(__clang__) && defined(CONV_MP_BIDIRECTIONAL_WINOGRAD_CPP) -#pragma clang diagnostic pop -#endif - -template -struct ConvWinograd3x3MultipassWrW final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId< - ConvWinograd3x3MultipassWrW>(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool IsDynamic() const override { return true; } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - // kernel_file_name for solver identification - static fs::path GetSolverFileNames(int id) - { - static const fs::path names[3] = {"xform_data.s", "xform_filter.s", "xform_out.s"}; - return names[id]; - } - - static std::string GetSolverKernelNames(int id) - { - static const std::string name_suffix = - '_' + std::to_string(WinoDataH) + '_' + std::to_string(WinoDataW) + '_' + - std::to_string(WinoFilterH) + '_' + std::to_string(WinoFilterW); - static const std::string names[3] = {"miopenGcnAsmWinogradXformData" + name_suffix, - "miopenGcnAsmWinogradXformFilter" + name_suffix, - "miopenGcnAsmWinogradXformOut" + name_suffix}; - - return names[id]; - } - - static int GetGroupCountMult() { return 4; } - - static int GetSolverWinoXformHWSize(const miopen::conv::ProblemDescription& problem, int id) - { - if(id == 0) - { - return WinoDataH + - (WinoFilterH - 1) * (WinoDataH == 7 ? 2 : problem.GetKernelStrideH()); - } - else - { - return WinoDataW + - (WinoFilterW - 1) * (WinoDataW == 7 ? 2 : problem.GetKernelStrideW()); - } - } - -private: - InvokerFactory PrepareInvokerFactory(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - std::size_t ws_sz) const; -}; - -// To suppress misleading clang warnings -#if defined(__clang__) && defined(CONV_MULTIPASS_WINO3X3WRW_CPP) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wweak-template-vtables" -#endif - -extern template struct ConvWinograd3x3MultipassWrW<3, 2>; -extern template struct ConvWinograd3x3MultipassWrW<3, 3>; -extern template struct ConvWinograd3x3MultipassWrW<3, 4>; -extern template struct ConvWinograd3x3MultipassWrW<3, 5>; -extern template struct ConvWinograd3x3MultipassWrW<3, 6>; -extern template struct ConvWinograd3x3MultipassWrW<7, 2>; -extern template struct ConvWinograd3x3MultipassWrW<7, 3>; -extern template struct ConvWinograd3x3MultipassWrW<1, 1, 7, 2>; -extern template struct ConvWinograd3x3MultipassWrW<1, 1, 7, 3>; -extern template struct ConvWinograd3x3MultipassWrW<7, 2, 1, 1>; -extern template struct ConvWinograd3x3MultipassWrW<7, 3, 1, 1>; -extern template struct ConvWinograd3x3MultipassWrW<5, 3>; -extern template struct ConvWinograd3x3MultipassWrW<5, 4>; - -#if defined(__clang__) && defined(CONV_MULTIPASS_WINO3X3WRW_CPP) -#pragma clang diagnostic pop -#endif - -struct PerformanceConfigAsmDirect3x3WrW : PerfConfigBase -{ - int limit_wave_cnt; // [0..9] - int reverse_inout; // [0..1], 1 is allowed for stride=1x1 only. - int chunk_size; // {16,8}, Smaller values increase register pressure. - int k_per_wave; // {1,2,4,8} && ((chunk_size * k_per_wave) <= 64). - // Higher values increase register pressure. - int pipe_lines_depth; // [1..16] && (pipe_lines_depth <= img_h). - // Higher values increase register pressure. - int n_per_group; // [1..8] && (n_per_group <= batch_size). - - PerformanceConfigAsmDirect3x3WrW(int lwc, int rio, int csz, int kpw, int pld, int npg); - PerformanceConfigAsmDirect3x3WrW() : PerformanceConfigAsmDirect3x3WrW(-1, -1, -1, -1, -1, -1) {} - PerformanceConfigAsmDirect3x3WrW(bool) : PerformanceConfigAsmDirect3x3WrW(0, 0, 8, 1, 1, 1) {} - - template - static void Visit(Self&& self, F f) - { - f(self.limit_wave_cnt, "limit_wave_cnt"); - f(self.reverse_inout, "reverse_inout"); - f(self.chunk_size, "chunk_size"); - f(self.k_per_wave, "k_per_wave"); - f(self.pipe_lines_depth, "pipe_lines_depth"); - f(self.n_per_group, "n_per_group"); - } - - // clang-format off - int GetLimitWaveCnt() const { return limit_wave_cnt; } - int GetReverseInout() const { return reverse_inout; } - int GetChunkSize() const { return chunk_size; } - int GetKPerWave() const { return k_per_wave; } - int GetPipeLinesDepth() const { return pipe_lines_depth; } - int GetNPerGroup() const { return n_per_group; } - int GetCPerWave() const { assert(chunk_size); return 64 / chunk_size; } // clang-format on - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigAsmDirect3x3WrW& other) const; -}; - -struct ConvAsmBwdWrW3x3 final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmDirect3x3WrW GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmDirect3x3WrW&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmDirect3x3WrW - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmDirect3x3WrW& config) const override; -}; - -template -struct ConvWinoFuryRxS final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId>(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -// Suppress misleading clang warnings -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wweak-template-vtables" -#endif - -extern template struct ConvWinoFuryRxS<2, 3>; -// extern template struct ConvWinoFuryRxS<3, 2>; - -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -struct PerformanceConfigConvAsmBwdWrW1x1 : PerfConfigBase -{ - - int chunk_size; // {1,2,4,8,16} - int c_per_gpr; // {1,2,4,8,16} - int c_mult; // {1,2,4,8,16} - int k_per_gpr; // {1,2,4,8,16} - int k_mult; // {1,2,4,8,16} - int n_per_gpr; // {1,2,4} - int n_part_cnt; // [1..8] - int read_size; // [1..4] - int short_store; // {0,1} - int data_prefetch; // [0..4] - bool use_spare_set; - - /// The following conditions must be met. - /// - /// Shader design-related constraints: - /// - (A) (chunk_size * c_per_gpr) == 16 - /// - (B) k_per_gpr <= c_per_gpr - /// - (C) (c_mult > 1 || k_mult > 1) - /// ? ((fwd_C % (c_per_gpr * c_mult) == 0) && (fwd_K % (k_per_gpr * k_mult) == 0)) - /// : (true) - /// - /// Resource-related constraints: - /// - (D) c_mult * k_mult * k_per_gpr + 9 + (c_mult + k_mult) * read_size * pipe_depth <= 256 - /// - /// Where: - /// - fwd_C := Num input channels for forward convolution (-c). - /// For backward, this is actually n_outputs. - /// - fwd_K := Num output channels for forward convolution (-k). - /// For backward, this is actually n_inputs. - - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsmBwdWrW1x1(int chunk_size_, - int c_per_gpr_, - int c_mult_, - int k_per_gpr_, - int k_mult_, - int n_per_gpr_, - int n_part_cnt_, - int read_size_, - int short_store_, - int data_prefetch_, - bool); - PerformanceConfigConvAsmBwdWrW1x1() - : PerformanceConfigConvAsmBwdWrW1x1(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, false) - { - } - PerformanceConfigConvAsmBwdWrW1x1(bool spare) - : PerformanceConfigConvAsmBwdWrW1x1(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, spare) - { - } - - template - static void Visit(Self&& self, F f) - { - f(self.chunk_size, "chunk_size"); - f(self.c_per_gpr, "c_per_gpr"); - f(self.c_mult, "c_mult"); - f(self.k_per_gpr, "k_per_gpr"); - f(self.k_mult, "k_mult"); - f(self.n_per_gpr, "n_per_gpr"); - f(self.n_part_cnt, "n_part_cnt"); - f(self.read_size, "read_size"); - f(self.short_store, "short_store"); - f(self.data_prefetch, "data_prefetch"); - } - - // clang-format off - int GetChunkSize() const { return chunk_size; } - int GetCPerGpr() const { return c_per_gpr; } - int GetCMult() const { return c_mult; } - int GetKPerGpr() const { return k_per_gpr; } - int GetKMult() const { return k_mult; } - int GetNPerGpr() const { return n_per_gpr; } - int GetNPartCnt() const { return n_part_cnt; } - int GetHWPerGpr() const { assert(c_per_gpr); assert(n_per_gpr); assert(chunk_size); - return wave_size / (c_per_gpr * n_per_gpr * chunk_size); } // "hw" stands for "height-and-width". - int GetReadSize() const { return read_size; } - int GetShortStore() const {return short_store; } - int GetDataPrefetch() const { return data_prefetch; } - // clang-format on - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigConvAsmBwdWrW1x1& other) const; -}; - -struct ConvAsmBwdWrW1x1 final : ConvTunableSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsmBwdWrW1x1 MIOPEN_INTERNALS_EXPORT - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsmBwdWrW1x1&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvAsmBwdWrW1x1 - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvAsmBwdWrW1x1&) const override; -}; - -/// N_BATCH_LOOPS - {1,2,4,8,16} Num batches processed in single workitem. -/// Required workspace size depends on it. However there is a restriction in the internal -/// Solver API that this shouldn't be so. Therefore the family of Solvers created. -/// Each Solver in the family has constant value of this parameter. -template -struct PerformanceConfigConvOclBwdWrw2 - : PerfConfigBase> -{ - // Num waves involved a workgroup. - int n_waves = -1; // {1,2,4,8} - // Num values to read in a workitem (read_unit). - int read_size = -1; // [6..12] - // Num of output channels (top/bottom layer in forward/backward direction) - // that share the same input channel in single workgroup. - // Also represents number of output channels in single tile. - int n_out_channels_per_tile = -1; // {1,2,4,8} - // How many tiles of output channels are processed in a single workgroup? - // n_out_channels_in_lcl * n_out_channels_tiles = total number of - // output channels processed in single workgroup. - int n_out_channels_tiles = -1; // {1,2,4,8} - // Num of output rows processed in a single iteration of loop in a workitem - // (N_ALIGNED_OUT_SCAN_BLK). - int n_out_rows_in_lcl = -1; // [2..11] - - PerformanceConfigConvOclBwdWrw2(int nw, int rs, int nocpt, int noct, int noril) - : n_waves(nw), - read_size(rs), - n_out_channels_per_tile(nocpt), - n_out_channels_tiles(noct), - n_out_rows_in_lcl(noril) - { - } - PerformanceConfigConvOclBwdWrw2() {} - PerformanceConfigConvOclBwdWrw2(bool) : PerformanceConfigConvOclBwdWrw2(1, 6, 1, 1, 2) {} - // spare_set is not used in this solver. - - template - static void Visit(Self&& self, F f) - { - f(self.n_waves, "n_waves"); - f(self.read_size, "read_size"); - f(self.n_out_channels_per_tile, "n_out_channels_per_tile"); - f(self.n_out_channels_tiles, "n_out_channels_tiles"); - f(self.n_out_rows_in_lcl, "n_out_rows_in_lcl"); - } - - // clang-format off - int GetNumWaves() const { return n_waves; } - int GetReadSize() const { return read_size; } - int GetNumOutChannelsPerTile() const { return n_out_channels_per_tile; } - int GetNumOutChannelTiles() const { return n_out_channels_tiles; } - int GetNumOutRowsPerIterPerWork() const { return n_out_rows_in_lcl; } // clang-format on - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigConvOclBwdWrw2& other) const; -}; - -template -struct ConvOclBwdWrW2 : ConvTunableSolver> -{ - const std::string& SolverDbId() const override - { - return this->template GetSolverDbId>(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvOclBwdWrw2 - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvOclBwdWrw2&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigConvOclBwdWrw2 - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigConvOclBwdWrw2&) const override; - -protected: - bool IsApplicableBase(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; -}; - -// To suppress misleading clang warnings -#if defined(__clang__) && defined(CONV_OCL_DIR2D_BWDWRW_2_CPP) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wweak-template-vtables" -#endif - -extern template struct PerformanceConfigConvOclBwdWrw2<1>; -extern template struct PerformanceConfigConvOclBwdWrw2<2>; -extern template struct PerformanceConfigConvOclBwdWrw2<4>; -extern template struct PerformanceConfigConvOclBwdWrw2<8>; -extern template struct PerformanceConfigConvOclBwdWrw2<16>; - -extern template struct ConvOclBwdWrW2<1>; -extern template struct ConvOclBwdWrW2<2>; -extern template struct ConvOclBwdWrW2<4>; -extern template struct ConvOclBwdWrW2<8>; -extern template struct ConvOclBwdWrW2<16>; - -#if defined(__clang__) && defined(CONV_OCL_DIR2D_BWDWRW_2_CPP) -#pragma clang diagnostic pop -#endif - -/// A separate solver from ConvOclBwdWrW2 to disable auto-tuning for certain configs. -/// Basically, this is *hack* for non-group 3x3 and 1x1 cases. -/// It is assumed that Solutions provided by the ConvOclBwdWrW2 solver -/// would never beat 3x3 and 1x1 assembly WrW kernels, even after tuning. -struct ConvOclBwdWrW2NonTunable final : ConvOclBwdWrW2<1> -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - InvokerFactory GetInvokerFactory(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const - { - return *GetSolution(ctx, problem).invoker_factory; - } - -private: - // This function dervied from ConvOclBwdWrW2 is declared private - // so that this solver is not marked searchable/tunable. - using ConvOclBwdWrW2<1>::GetDefaultPerformanceConfig; - using ConvOclBwdWrW2<1>::GetSolution; - using ConvOclBwdWrW2<1>::GetInvokerFactory; -}; - -struct ConvOclBwdWrW53 final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvOclBwdWrW1x1 final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct fft final : ConvSolver -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct PerformanceImplicitGemmWrwV4R4Xdlops : PerfConfigBase -{ - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - int GemmMPerWave; - int GemmNPerWave; - int GemmKPack; - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmK; - bool use_spare_set; - - MIOPEN_INTERNALS_EXPORT - PerformanceImplicitGemmWrwV4R4Xdlops(int, int, int, int, int, int, bool, bool, bool); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops(); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops(bool spare); - PerformanceImplicitGemmWrwV4R4Xdlops(int a, int b, int c, int d, int e, int f, bool g, bool h) - : PerformanceImplicitGemmWrwV4R4Xdlops(a, b, c, d, e, f, g, h, false) - { - } - - template - static void Visit(Self&& self, F f) - { - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmKPack, "GemmKPack"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmK, "GemmBThreadCopyMoreGemmK"); - } - - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceImplicitGemmWrwV4R4Xdlops& other) const; - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsReallyValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - IsFastToBeUsedForTuning(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmSizeAndGemmKBlock(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple CalculateBlockSize() const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGridSize(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmWrwV4R4Xdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmWrwV4R4Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmWrwV4R4Xdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; -}; - -struct PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm - : PerfConfigBase -{ - int GemmMPerBlock; - int GemmNPerBlock; - int GemmKPerBlock; - int GemmMPerWave; - int GemmNPerWave; - int GemmKPack; - int GemmMFactor; - int GemmNFactor; - int GemmKTotalFactor; - bool GemmAThreadCopyMoreGemmK; - bool GemmBThreadCopyMoreGemmK; - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm( - int, int, int, int, int, int, int, int, int, bool, bool); - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm(); - PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm(bool) - : PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm() - { - } - - template - static void Visit(Self&& self, F f) - { - f(self.GemmMPerBlock, "GemmMPerBlock"); - f(self.GemmNPerBlock, "GemmNPerBlock"); - f(self.GemmKPerBlock, "GemmKPerBlock"); - f(self.GemmMPerWave, "GemmMPerWave"); - f(self.GemmNPerWave, "GemmNPerWave"); - f(self.GemmKPack, "GemmKPack"); - f(self.GemmMFactor, "GemmMFactor"); - f(self.GemmNFactor, "GemmNFactor"); - f(self.GemmKTotalFactor, "GemmKTotalFactor"); - f(self.GemmAThreadCopyMoreGemmK, "GemmAThreadCopyMoreGemmK"); - f(self.GemmBThreadCopyMoreGemmK, "GemmBThreadCopyMoreGemmK"); - } - - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm& other) const; - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - MIOPEN_INTERNALS_EXPORT bool IsValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool IsReallyValid(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - IsFastToBeUsedForTuning(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; - - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmSizeAndGemmKBlock(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple CalculateBlockSize() const; - std::tuple CalculateGridSize(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmABlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateGemmBBlockCopyPerformanceParameters(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT std::tuple - CalculateLdsNumberOfByte(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmWrwV4R4Xdlops_Padded_Gemm final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceImplicitGemmWrwV4R4Xdlops_Padded_Gemm - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; -}; - -struct PerformanceConvCkIgemmFwdV6r1DlopsNchw - : PerfConfigBase -{ - int ck_tunable_list_id; - - PerformanceConvCkIgemmFwdV6r1DlopsNchw(int a) : ck_tunable_list_id(a) {} - - PerformanceConvCkIgemmFwdV6r1DlopsNchw() : PerformanceConvCkIgemmFwdV6r1DlopsNchw(-1) {} - - PerformanceConvCkIgemmFwdV6r1DlopsNchw(bool) : PerformanceConvCkIgemmFwdV6r1DlopsNchw(0) {} - - template - static void Visit(Self&& self, F f) - { - f(self.ck_tunable_list_id, "ck_tunable_list_id"); - } - - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - bool operator==(const PerformanceConvCkIgemmFwdV6r1DlopsNchw& config) const - { - return ck_tunable_list_id == config.ck_tunable_list_id; - } -}; - -struct ConvCkIgemmFwdV6r1DlopsNchw final : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - bool IsDynamic() const override { return false; } - MIOPEN_INTERNALS_EXPORT PerformanceConvCkIgemmFwdV6r1DlopsNchw GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvCkIgemmFwdV6r1DlopsNchw&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConvCkIgemmFwdV6r1DlopsNchw - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConvCkIgemmFwdV6r1DlopsNchw&) const override; -}; - -struct ConvDirectNaiveConvFwd final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - /// Use very small fixed value enough to backup GEMM for cases when - /// GEMM is disabled. - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.01f; - } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvDirectNaiveConvBwd final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - /// Use very small fixed value enough to backup GEMM for cases when - /// GEMM is disabled. - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.01f; - } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct ConvDirectNaiveConvWrw final : ConvSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - /// Use very small fixed value enough to backup GEMM for cases when - /// GEMM is disabled. - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.01f; - } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct GemmFwdBase : ConvSolver -{ - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - -private: - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmFwd1x1_0_2; - friend struct GemmFwd1x1_0_1_int8; - friend struct GemmFwd1x1_0_1; - friend struct GemmFwdRest; -}; - -struct GemmFwd1x1_0_2 final : GemmFwdBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmFwdRest; -}; - -struct GemmFwd1x1_0_1_int8 final : GemmFwdBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmFwdRest; -}; - -struct GemmFwd1x1_0_1 final : GemmFwdBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmFwdRest; -}; - -struct GemmFwdRest final : GemmFwdBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct GemmBwdBase : ConvSolver -{ - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - -private: - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmBwd1x1_stride2; - friend struct GemmBwd1x1_stride1; - friend struct GemmBwdRest; -}; - -struct GemmBwd1x1_stride2 final : GemmBwdBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmBwdRest; -}; - -struct GemmBwd1x1_stride1 final : GemmBwdBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, - const miopen::conv::ProblemDescription& problem) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution( - const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const override; - - friend struct GemmBwdRest; -}; - -struct GemmBwdRest final : GemmBwdBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct GemmWrwBase : ConvSolver -{ - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT float GetWti(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - -private: - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmWrw1x1_stride1; - friend struct GemmWrwUniversal; -}; - -struct GemmWrw1x1_stride1 final : GemmWrwBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - friend struct GemmWrwUniversal; -}; - -struct GemmWrwUniversal final : GemmWrwBase -{ - const std::string& SolverDbId() const override { return GetSolverDbId(); } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct PerformanceConfigAsmImplicitGemmGTC : PerfConfigBase -{ - std::string direction; - std::string tensor_layout; - std::string precision; - int nxb; - int nxe; - - int gemm_m_per_block; - int gemm_n_per_block; - int gemm_k_per_block; - - int wave_tile_m; - int wave_tile_n; - int wave_tile_k; - int wave_step_m; - int wave_step_n; - int wave_repeat_m; - int wave_repeat_n; - - int multihead; - int vector_store; - int gemm_k_global_split; - int merge_e; - int tensor_a_pass_through; - - std::vector tensor_a_thread_lengths; - std::vector tensor_a_cluster_lengths; - std::vector tensor_b_thread_lengths; - std::vector tensor_b_cluster_lengths; - - bool use_spare_set; - int index; - - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTC(std::string dir, - std::string layout, - std::string prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false); - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTC(std::string dir, - std::string layout, - miopenDataType_t prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false); - PerformanceConfigAsmImplicitGemmGTC() - : PerformanceConfigAsmImplicitGemmGTC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - false) - { - } - PerformanceConfigAsmImplicitGemmGTC(bool spare) - : PerformanceConfigAsmImplicitGemmGTC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - spare) - { - } - - template - static void Visit(Self&& self, F f) - { - f(self.direction, "dir"); - f(self.tensor_layout, "lyt"); - f(self.precision, "pre"); - f(self.nxb, "nxb"); - f(self.nxe, "nxe"); - f(self.gemm_m_per_block, "mpb"); - f(self.gemm_n_per_block, "npb"); - f(self.gemm_k_per_block, "kpb"); - - f(self.wave_tile_m, "wtm"); - f(self.wave_tile_n, "wtn"); - f(self.wave_tile_k, "wtk"); - f(self.wave_step_m, "wsm"); - f(self.wave_step_n, "wsn"); - f(self.wave_repeat_m, "wrm"); - f(self.wave_repeat_n, "wrn"); - - f(self.multihead, "mh"); - f(self.vector_store, "vs"); - f(self.gemm_k_global_split, "gks"); - f(self.merge_e, "me"); - f(self.tensor_a_pass_through, "pta"); - - f(self.tensor_a_thread_lengths[0], "ta0"); - f(self.tensor_a_thread_lengths[1], "ta1"); - f(self.tensor_a_thread_lengths[2], "ta2"); - f(self.tensor_a_thread_lengths[3], "ta3"); - - f(self.tensor_a_cluster_lengths[0], "ca0"); - f(self.tensor_a_cluster_lengths[1], "ca1"); - f(self.tensor_a_cluster_lengths[2], "ca2"); - f(self.tensor_a_cluster_lengths[3], "ca3"); - - f(self.tensor_b_thread_lengths[0], "tb0"); - f(self.tensor_b_thread_lengths[1], "tb1"); - f(self.tensor_b_thread_lengths[2], "tb2"); - f(self.tensor_b_thread_lengths[3], "tb3"); - - f(self.tensor_b_cluster_lengths[0], "cb0"); - f(self.tensor_b_cluster_lengths[1], "cb1"); - f(self.tensor_b_cluster_lengths[2], "cb2"); - f(self.tensor_b_cluster_lengths[3], "cb3"); - f(self.index, "index"); - } - - // Chilrden must provide support for ComputedContainer. - void HeuristicInit(const ExecutionContext&) = delete; - bool SetNextValue(const miopen::conv::ProblemDescription&) = delete; - bool IsValidValue() const = delete; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription&) const = delete; - - MIOPEN_INTERNALS_EXPORT bool IsDefaultConstructed() const; - MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigAsmImplicitGemmGTC& other) const; - MIOPEN_INTERNALS_EXPORT void CopyParameters(const PerformanceConfigAsmImplicitGemmGTC& other); - MIOPEN_INTERNALS_EXPORT std::string ToString() const override; - MIOPEN_INTERNALS_EXPORT std::string ToKernelName(const ExecutionContext&) const; - MIOPEN_INTERNALS_EXPORT int BlockSize() const; -}; - -struct PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC : PerformanceConfigAsmImplicitGemmGTC -{ - PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC(std::string dir, - std::string layout, - std::string prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTC(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - wtm, - wtn, - wtk, - wsm, - wsn, - wrm, - wrn, - mh, - vs, - gks, - me, - pta, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC(std::string dir, - std::string layout, - miopenDataType_t prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTC(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - wtm, - wtn, - wtk, - wsm, - wsn, - wrm, - wrn, - mh, - vs, - gks, - me, - pta, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC() - : PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - false) - { - } - PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC(bool spare) - : PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - spare) - { - } - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription& config); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvAsmImplicitGemmGTCDynamicFwdXdlopsNHWC final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCFwdXdlopsNHWC&) const override; -}; - -struct PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC : PerformanceConfigAsmImplicitGemmGTC -{ - PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC(std::string dir, - std::string layout, - std::string prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTC(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - wtm, - wtn, - wtk, - wsm, - wsn, - wrm, - wrn, - mh, - vs, - gks, - me, - pta, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC(std::string dir, - std::string layout, - miopenDataType_t prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTC(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - wtm, - wtn, - wtk, - wsm, - wsn, - wrm, - wrn, - mh, - vs, - gks, - me, - pta, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC() - : PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - false) - { - } - PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC(bool spare) - : PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - spare) - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvAsmImplicitGemmGTCDynamicBwdXdlopsNHWC final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCBwdXdlopsNHWC&) const override; -}; - -struct PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC : PerformanceConfigAsmImplicitGemmGTC -{ - PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC(std::string dir, - std::string layout, - std::string prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTC(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - wtm, - wtn, - wtk, - wsm, - wsn, - wrm, - wrn, - mh, - vs, - gks, - me, - pta, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC(std::string dir, - std::string layout, - miopenDataType_t prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int wtm, - int wtn, - int wtk, - int wsm, - int wsn, - int wrm, - int wrn, - int mh, - int vs, - int gks, - int me, - int pta, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTC(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - wtm, - wtn, - wtk, - wsm, - wsn, - wrm, - wrn, - mh, - vs, - gks, - me, - pta, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC() - : PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - false) - { - } - PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC(bool spare) - : PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC("fwd", - "nchw", - "fp32", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - spare) - { - } - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT size_t ComputeKernelOccupancy() const; - -private: - void SetParamsForKSplit(const miopen::conv::ProblemDescription& problem, - const size_t& occupancy); -}; - -struct ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC&) const override; -}; - -struct PerformanceConfigAsmImplicitGemmGTCvector - : PerfConfigBase -{ - std::string direction; - std::string tensor_layout; - std::string precision; - int nxb; - int nxe; - - int gemm_m_per_block; - int gemm_n_per_block; - int gemm_k_per_block; - - int lanegroup_tile_m; - int lanegroup_tile_n; - int lanegroup_wave_m; - int lanegroup_wave_n; - int lanegroup_repeat_m; - int lanegroup_repeat_n; - - int vector_c; - - std::vector tensor_a_thread_lengths; - std::vector tensor_a_cluster_lengths; - std::vector tensor_b_thread_lengths; - std::vector tensor_b_cluster_lengths; - - bool use_spare_set; - int index; - - MIOPEN_INTERNALS_EXPORT - PerformanceConfigAsmImplicitGemmGTCvector(std::string dir, - std::string layout, - std::string prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int lgtm, - int lgtn, - int lgpwm, - int lgpwn, - int lgrm, - int lgrn, - int vec_c, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false); - - MIOPEN_INTERNALS_EXPORT - PerformanceConfigAsmImplicitGemmGTCvector(std::string dir, - std::string layout, - miopenDataType_t prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int lgtm, - int lgtn, - int lgpwm, - int lgpwn, - int lgrm, - int lgrn, - int vec_c, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false); - - PerformanceConfigAsmImplicitGemmGTCvector() - : PerformanceConfigAsmImplicitGemmGTCvector("fwd", - "nchwc_kcyxc", - "Half", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - false) - { - } - PerformanceConfigAsmImplicitGemmGTCvector(bool spare) - : PerformanceConfigAsmImplicitGemmGTCvector("fwd", - "nchwc_kcyxc", - "Half", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - spare) - { - } - - template - static void Visit(Self&& self, F f) - { - f(self.direction, "dir"); - f(self.tensor_layout, "lyt"); - f(self.precision, "pre"); - f(self.nxb, "nxb"); - f(self.nxe, "nxe"); - f(self.gemm_m_per_block, "mpb"); - f(self.gemm_n_per_block, "npb"); - f(self.gemm_k_per_block, "kpb"); - - f(self.lanegroup_tile_m, "lgtm"); - f(self.lanegroup_tile_n, "lgtn"); - f(self.lanegroup_wave_m, "lgpwm"); - f(self.lanegroup_wave_n, "lgpwn"); - f(self.lanegroup_repeat_m, "lgrm"); - f(self.lanegroup_repeat_n, "lgrn"); - - f(self.vector_c, "vec_c"); - - f(self.tensor_a_thread_lengths[0], "ta0"); - f(self.tensor_a_thread_lengths[1], "ta1"); - f(self.tensor_a_thread_lengths[2], "ta2"); - f(self.tensor_a_thread_lengths[3], "ta3"); - - f(self.tensor_a_cluster_lengths[0], "ca0"); - f(self.tensor_a_cluster_lengths[1], "ca1"); - f(self.tensor_a_cluster_lengths[2], "ca2"); - f(self.tensor_a_cluster_lengths[3], "ca3"); - - f(self.tensor_b_thread_lengths[0], "tb0"); - f(self.tensor_b_thread_lengths[1], "tb1"); - f(self.tensor_b_thread_lengths[2], "tb2"); - f(self.tensor_b_thread_lengths[3], "tb3"); - - f(self.tensor_b_cluster_lengths[0], "cb0"); - f(self.tensor_b_cluster_lengths[1], "cb1"); - f(self.tensor_b_cluster_lengths[2], "cb2"); - f(self.tensor_b_cluster_lengths[3], "cb3"); - f(self.index, "index"); - } - - // Chilrden must provide support for ComputedContainer. - void HeuristicInit(const ExecutionContext&) = delete; - bool SetNextValue(const miopen::conv::ProblemDescription&) = delete; - bool IsValidValue() const = delete; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription&) const = delete; - - MIOPEN_INTERNALS_EXPORT bool IsDefaultConstructed() const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigAsmImplicitGemmGTCvector& other) const; - MIOPEN_INTERNALS_EXPORT void - CopyParameters(const PerformanceConfigAsmImplicitGemmGTCvector& other); - MIOPEN_INTERNALS_EXPORT std::string ToString() const override; - MIOPEN_INTERNALS_EXPORT std::string ToKernelName(const ExecutionContext&) const; - MIOPEN_INTERNALS_EXPORT int BlockSize() const; -}; -struct PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC : PerformanceConfigAsmImplicitGemmGTCvector -{ - - PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC(std::string dir, - std::string layout, - std::string prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int lgtm, - int lgtn, - int lgpwm, - int lgpwn, - int lgrm, - int lgrn, - int vec_c, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTCvector(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - lgtm, - lgtn, - lgpwm, - lgpwn, - lgrm, - lgrn, - vec_c, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - - PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC(std::string dir, - std::string layout, - miopenDataType_t prec, - int b, - int e, - int mpb, - int npb, - int kpb, - int lgtm, - int lgtn, - int lgpwm, - int lgpwn, - int lgrm, - int lgrn, - int vec_c, - std::initializer_list ta_t, - std::initializer_list ta_c, - std::initializer_list tb_t, - std::initializer_list tb_c, - bool spare = false) - : PerformanceConfigAsmImplicitGemmGTCvector(dir, - layout, - prec, - b, - e, - mpb, - npb, - kpb, - lgtm, - lgtn, - lgpwm, - lgpwn, - lgrm, - lgrn, - vec_c, - ta_t, - ta_c, - tb_t, - tb_c, - spare) - { - } - - PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC() - : PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC("fwd", - "nchwc_kcyxc", - "Half", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - false) - { - } - PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC(bool spare) - : PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC("fwd", - "nchwc_kcyxc", - "Half", - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - {1, 1, 1, 1}, - spare) - { - } - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigAsmImplicitGemmGTCFwdDlopsNCHWC&) const override; -}; - -struct PerformanceConfigHipImplicitGemmFwdXdlops - : PerfConfigBaseCK -{ - int index = 0; - std::string kernel_id = ""; - std::vector valid_kernels; - - PerformanceConfigHipImplicitGemmFwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - - PerformanceConfigHipImplicitGemmFwdXdlops() = default; - - explicit PerformanceConfigHipImplicitGemmFwdXdlops(bool) - : PerformanceConfigHipImplicitGemmFwdXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmFwdXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmFwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmFwdXdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmFwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmFwdXdlops&) const override; - /// \anchor igemm_get_wti_magic_number - // Magic Number Alert: - // Naive convolutions have GetWti() that return very small value (0.01f). - // This allows MIOpen to use Naive Solvers if no other applicable Solvers - // have known WTIs. Right now this means that in case of find-db miss, - // the library will try to use Winograd or GEMM (whatever is faster according - // to their GetWti's), but if both are not applicable, the library will - // use Naive Solver - // Since we would like to us CK before naive, and use it instead (because - // we do expect that CK is faster than Naive), therefore we use a - // value bigger than 0.01f, e.g. 0.02f. - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemmBwdXdlops - : PerfConfigBaseCK -{ - int index = 0; - std::string kernel_id = ""; - std::vector valid_kernels; - - PerformanceConfigHipImplicitGemmBwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - - PerformanceConfigHipImplicitGemmBwdXdlops() = default; - - explicit PerformanceConfigHipImplicitGemmBwdXdlops(bool) - : PerformanceConfigHipImplicitGemmBwdXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmBwdXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmBwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmBwdXdlops GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmBwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmBwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmBwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemmGroupFwdXdlops - : PerfConfigBaseCK -{ - int index = 0; - std::string kernel_id = ""; - std::vector valid_kernels; - - PerformanceConfigHipImplicitGemmGroupFwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - - PerformanceConfigHipImplicitGemmGroupFwdXdlops() = default; - - explicit PerformanceConfigHipImplicitGemmGroupFwdXdlops(bool) - : PerformanceConfigHipImplicitGemmGroupFwdXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmGroupFwdXdlops& other) const; - MIOPEN_INTERNALS_EXPORT bool - IsModelApplicable(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const; - -private: -#if MIOPEN_ENABLE_AI_KERNEL_TUNING - std::vector heuristic_indexes; - std::unordered_map> heuristic_kernels; - template - bool RunParameterPredictionModel(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem); - void InitHeuristicKernelIDs(const std::string& type); - bool ModelApplyToken(int idx, std::string value, const std::string& arch); -#endif - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmGroupFwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmGroupFwdXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmGroupFwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmGroupFwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmGroupFwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemm3DGroupFwdXdlops - : PerfConfigBaseCK -{ - int index = 0; - std::string kernel_id = ""; - std::vector valid_kernels; - - PerformanceConfigHipImplicitGemm3DGroupFwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - - PerformanceConfigHipImplicitGemm3DGroupFwdXdlops() = default; - - explicit PerformanceConfigHipImplicitGemm3DGroupFwdXdlops(bool) - : PerformanceConfigHipImplicitGemm3DGroupFwdXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemm3DGroupFwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemm3DGroupFwdXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemm3DGroupFwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmGroupFwdXdlopsCodegen final : ConvSolver -{ - // TODO: update this fcn - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - bool MayNeedWorkspace() const override { return true; } - - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; -}; - -struct PerformanceConfigHipImplicitGemm3DGroupWrwXdlops - : PerfConfigBaseCK -{ - int index; - std::string kernel_id; - std::vector valid_kernels; - PerformanceConfigHipImplicitGemm3DGroupWrwXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - PerformanceConfigHipImplicitGemm3DGroupWrwXdlops() - : PerformanceConfigHipImplicitGemm3DGroupWrwXdlops(0, "") - { - } - PerformanceConfigHipImplicitGemm3DGroupWrwXdlops(bool) - : PerformanceConfigHipImplicitGemm3DGroupWrwXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemm3DGroupWrwXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemm3DGroupWrwXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemm3DGroupWrwXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemm3DGroupWrwXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemm3DGroupBwdXdlops - : PerfConfigBaseCK -{ - int index; - std::string kernel_id; - std::vector valid_kernels; - PerformanceConfigHipImplicitGemm3DGroupBwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - PerformanceConfigHipImplicitGemm3DGroupBwdXdlops() - : PerformanceConfigHipImplicitGemm3DGroupBwdXdlops(0, "") - { - } - PerformanceConfigHipImplicitGemm3DGroupBwdXdlops(bool) - : PerformanceConfigHipImplicitGemm3DGroupBwdXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemm3DGroupBwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemm3DGroupBwdXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemm3DGroupBwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemm3DGroupBwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemmGroupBwdXdlops - : PerfConfigBaseCK -{ - int index; - std::string kernel_id; - std::vector valid_kernels; - PerformanceConfigHipImplicitGemmGroupBwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - PerformanceConfigHipImplicitGemmGroupBwdXdlops() - : PerformanceConfigHipImplicitGemmGroupBwdXdlops(0, "") - { - } - PerformanceConfigHipImplicitGemmGroupBwdXdlops(bool) - : PerformanceConfigHipImplicitGemmGroupBwdXdlops(0, "") - { - } - - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmGroupBwdXdlops& other) const; - MIOPEN_INTERNALS_EXPORT bool - IsModelApplicable(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const; - -private: -#if MIOPEN_ENABLE_AI_KERNEL_TUNING - std::vector heuristic_indexes; - std::unordered_map> heuristic_kernels; - template - bool RunParameterPredictionModel(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem); - void InitHeuristicKernelIDs(); - bool ModelApplyToken(int idx, std::string value); -#endif - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmGroupBwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmGroupBwdXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmGroupBwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmGroupBwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmGroupBwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemmGroupWrwXdlops - : PerfConfigBaseCK -{ - int index; - int split_k; - std::string kernel_id; - std::vector valid_kernels; - PerformanceConfigHipImplicitGemmGroupWrwXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - PerformanceConfigHipImplicitGemmGroupWrwXdlops() - : PerformanceConfigHipImplicitGemmGroupWrwXdlops(0, "") - { - } - PerformanceConfigHipImplicitGemmGroupWrwXdlops(bool) - : PerformanceConfigHipImplicitGemmGroupWrwXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const ExecutionContext&, - const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmGroupWrwXdlops& other) const; - MIOPEN_INTERNALS_EXPORT bool - IsModelApplicable(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const; - -private: -#if MIOPEN_ENABLE_AI_KERNEL_TUNING - std::vector heuristic_indexes; - std::unordered_map> heuristic_kernels; - template - bool RunParameterPredictionModel(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem); - void InitHeuristicKernelIDs(const std::string& type); - bool ModelApplyToken(int idx, - std::string value, - const std::string& arch, - const miopen::conv::ProblemDescription& problem); -#endif - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmGroupWrwXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmGroupWrwXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmGroupWrwXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmGroupWrwXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmGroupWrwXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - - MIOPEN_INTERNALS_EXPORT size_t GetWorkspaceSize( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool MayNeedWorkspace() const override { return true; } - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops - : PerfConfigBaseCK -{ - int index = 0; - std::string kernel_id = ""; - std::vector valid_kernels; - - PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - - PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops() = default; - - explicit PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops(bool) - : PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmF16F8F16FwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmF16F8F16FwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops - : PerfConfigBaseCK -{ - int index; - std::string kernel_id; - std::vector valid_kernels; - PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops() - : PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops(0, "") - { - } - PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops(bool) - : PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmF16F8F16BwdXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmF16F8F16BwdXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -struct PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops - : PerfConfigBaseCK -{ - int index; - std::string kernel_id; - std::vector valid_kernels; - PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops(int idx, std::string kernl_id) - : index(idx), kernel_id(kernl_id) - { - } - PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops() - : PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops(0, "") - { - } - PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops(bool) - : PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops(0, "") - { - } - MIOPEN_INTERNALS_EXPORT void HeuristicInit(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::conv::ProblemDescription&); - MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; - bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription& problem) const - { - return IsValid(problem); - } - MIOPEN_INTERNALS_EXPORT bool IsValid(const miopen::conv::ProblemDescription&) const; - MIOPEN_INTERNALS_EXPORT bool - operator==(const PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops& other) const; - -private: - template - void Init(const miopen::conv::ProblemDescription&); - template - bool CheckIsSupportCKArgs(const miopen::conv::ProblemDescription&) const; -}; - -struct ConvHipImplicitGemmF16F8F16WrwXdlops final - : ConvTunableSolver -{ - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } - - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops - GetDefaultPerformanceConfig(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT bool IsValidPerformanceConfig( - const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops&) const override; - MIOPEN_INTERNALS_EXPORT PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - bool IsDynamic() const override { return true; } - MIOPEN_INTERNALS_EXPORT ConvSolution - GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const PerformanceConfigHipImplicitGemmF16F8F16WrwXdlops&) const override; - /// \ref igemm_get_wti_magic_number - float GetWti(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override - { - return 0.02f; - }; - -private: - template - bool CheckCKApplicability(const miopen::conv::ProblemDescription&) const; -}; - -} // namespace conv - // Use struct as a syntactic sugar to make the intent as clear as possible. struct ThisSolverIsDeprecatedStatic { diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp index d16b4d38e3..e48d4e8233 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp @@ -27,7 +27,7 @@ #include #include -#include +#include #include #include #include From 80a87f4a3e257e1fbb6f0d77ee5961a27c5b4610 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 18 Oct 2024 08:05:26 +0000 Subject: [PATCH 42/69] added codegen compiler flag in CK again, set default to ON --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 10bd17b5af..e4ff0b2963 100755 --- a/Dockerfile +++ b/Dockerfile @@ -114,7 +114,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ miopen-hip # TODO: it should be able to automatically get commit hash from requirements.txt -ARG CK_COMMIT=825d9008a68a576749a455c4953ddac7b29542aa +ARG CK_COMMIT=3340516722ea729f57ef11d2d38aded39fdbbced RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ diff --git a/requirements.txt b/requirements.txt index eab6cfbd68..c3f2ac83db 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@825d9008a68a576749a455c4953ddac7b29542aa -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@3340516722ea729f57ef11d2d38aded39fdbbced -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From 0a712a1733145fb9658df64f280d464feaa7ed14 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 18 Oct 2024 19:21:21 +0000 Subject: [PATCH 43/69] resolved enable_if_t standard header usage error, added codegen compiler flag to requirements.txt --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e4ff0b2963..9eabde8f7c 100755 --- a/Dockerfile +++ b/Dockerfile @@ -114,7 +114,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ miopen-hip # TODO: it should be able to automatically get commit hash from requirements.txt -ARG CK_COMMIT=3340516722ea729f57ef11d2d38aded39fdbbced +ARG CK_COMMIT=7d3ee2660b10b3c95f944655a372f53ab16667cd RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ diff --git a/requirements.txt b/requirements.txt index c3f2ac83db..fa41db58d5 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@3340516722ea729f57ef11d2d38aded39fdbbced -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@7d3ee2660b10b3c95f944655a372f53ab16667cd -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON google/googletest@v1.14.0 From 28a0a102143ae9ae676beb3489170ce019d14844 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 18 Oct 2024 19:45:30 +0000 Subject: [PATCH 44/69] added codegen compiler flag to Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 9eabde8f7c..760db11a36 100755 --- a/Dockerfile +++ b/Dockerfile @@ -124,7 +124,7 @@ RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK -D CMAKE_CXX_COMPILER_LAUNCHER="${COMPILER_LAUNCHER}" \ -D CMAKE_BUILD_TYPE=Release \ -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1100" \ - -D CMAKE_CXX_FLAGS=" -O3 " .. && \ + -D CMAKE_CXX_FLAGS=" -O3 -DCK_USE_CODEGEN=ON " .. && \ make -j $(nproc) install # Composable Kernel installed separated from rbuild to take in values from GPU_ARCHS From 8b793bc2ecf2b60fd6b413c8f1fcdaedb661b10a Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Fri, 18 Oct 2024 20:25:51 +0000 Subject: [PATCH 45/69] fixed codegen compiler flag issue --- Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 760db11a36..e52011b862 100755 --- a/Dockerfile +++ b/Dockerfile @@ -124,7 +124,8 @@ RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK -D CMAKE_CXX_COMPILER_LAUNCHER="${COMPILER_LAUNCHER}" \ -D CMAKE_BUILD_TYPE=Release \ -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1100" \ - -D CMAKE_CXX_FLAGS=" -O3 -DCK_USE_CODEGEN=ON " .. && \ + -D CMAKE_CXX_FLAGS=" -O3 " \ + -DCK_USE_CODEGEN=ON .. && \ make -j $(nproc) install # Composable Kernel installed separated from rbuild to take in values from GPU_ARCHS From 2b1191f64fddea5067c9286ece1d4eec7f56cf3f Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 28 Oct 2024 21:18:48 +0000 Subject: [PATCH 46/69] renamed codegen solver --- ...=> codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp} | 2 -- 1 file changed, 2 deletions(-) rename src/solver/{tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp => codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp} (99%) diff --git a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp similarity index 99% rename from src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp rename to src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp index e48d4e8233..df3be22cfb 100644 --- a/src/solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp +++ b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp @@ -249,7 +249,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( decltype(auto) out = problem.GetOut();**/ const auto workspace_req = GetWorkspaceSize(ctx, problem); - std::cout << "workspace: " << workspace_req << std::endl; auto soln = ConvSolution{miopenStatusSuccess}; soln.workspace_sz = workspace_req; @@ -295,7 +294,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); kernel.comp_options += " -DCK_DONT_USE_HIP_RUNTIME_HEADERS"; kernel.comp_options += " -DCK_CODE_GEN_RTC"; - std::cout << "comp options: " << kernel.comp_options << std::endl; soln.construction_params.push_back(kernel); From 6ab9d7265804c712c51d55623c47fca6ff7bf9fc Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 28 Oct 2024 21:19:27 +0000 Subject: [PATCH 47/69] added in gtest for codegen --- test/gtest/codegen_group_conv2d_fwd.cpp | 32 +++++++++++++++++ test/gtest/group_conv.hpp | 48 +++++++++++++++---------- 2 files changed, 61 insertions(+), 19 deletions(-) create mode 100644 test/gtest/codegen_group_conv2d_fwd.cpp diff --git a/test/gtest/codegen_group_conv2d_fwd.cpp b/test/gtest/codegen_group_conv2d_fwd.cpp new file mode 100644 index 0000000000..04b504b619 --- /dev/null +++ b/test/gtest/codegen_group_conv2d_fwd.cpp @@ -0,0 +1,32 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#include + +#include "group_conv.hpp" + +using namespace group_conv; + +DEFINE_CG_GROUP_CONV2D_TEST(half, FP16, Forward); diff --git a/test/gtest/group_conv.hpp b/test/gtest/group_conv.hpp index 5b2b26028c..8206fce0d3 100644 --- a/test/gtest/group_conv.hpp +++ b/test/gtest/group_conv.hpp @@ -95,14 +95,21 @@ struct GroupConvTestConfig<2u> 1.0}; } - template + template static std::vector GetConfigs() { - - if constexpr(DIR == Direction::Forward) + if(cg == true) { + return { + {1, 256, 192, 192, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + }; + } + else + { + if constexpr(DIR == Direction::Forward) + { - // clang-format off + // clang-format off return { // g n C K img filter pad stride dilation {1 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, @@ -112,11 +119,11 @@ struct GroupConvTestConfig<2u> {8 , 256, 384 , 384 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, {32, 256, 1024, 2048, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, }; - // clang-format on - } - else if constexpr(DIR == Direction::BackwardData || DIR == Direction::BackwardWeights) - { - // clang-format off + // clang-format on + } + else if constexpr(DIR == Direction::BackwardData || DIR == Direction::BackwardWeights) + { + // clang-format off return { // g n C K img filter pad stride dilation {1 , 1 , 1 , 1 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, @@ -126,11 +133,12 @@ struct GroupConvTestConfig<2u> {8 , 256, 384 , 384 , {28, 28}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}, {32, 256, 1024, 2048, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, }; - // clang-format on - } - else - { - std::abort(); + // clang-format on + } + else + { + std::abort(); + } } } }; @@ -191,7 +199,7 @@ struct GroupConvTestConfig<3u> 1.0}; } - template + template static std::vector GetConfigs() { @@ -569,7 +577,7 @@ std::vector GetBetaValues() } } -#define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir) \ +#define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir, cg) \ struct GPU_GroupConv##ndim##D_##dir##_##naming_type \ : GroupConvTestFix \ { \ @@ -582,12 +590,14 @@ std::vector GetBetaValues() Full, \ GPU_GroupConv##ndim##D_##dir##_##naming_type, \ testing::Combine( \ - testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ + testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ testing::ValuesIn(GetAlphaValues()), \ testing::ValuesIn(GetBetaValues()), \ testing::ValuesIn(GetLayoutValues()))); #define DEFINE_GROUP_CONV2D_TEST(type, naming_type, dir) \ - DEFINE_GROUP_CONV_TEST(2, type, naming_type, dir) + DEFINE_GROUP_CONV_TEST(2, type, naming_type, dir, false) +#define DEFINE_CG_GROUP_CONV2D_TEST(type, naming_type, dir) \ + DEFINE_GROUP_CONV_TEST(2, type, naming_type, dir, true) #define DEFINE_GROUP_CONV3D_TEST(type, naming_type, dir) \ - DEFINE_GROUP_CONV_TEST(3, type, naming_type, dir) + DEFINE_GROUP_CONV_TEST(3, type, naming_type, dir, false) From 3d51f2c2e7149fa13758bed9367ceea3de75d9f3 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 14 Nov 2024 09:14:50 +0000 Subject: [PATCH 48/69] solver using AddKernels now, still have an issue with problem desc/cmd line arg --- ...ip_implicit_gemm_2d_grouped_fwd_xdlops.cpp | 121 ++++++++++++------ 1 file changed, 83 insertions(+), 38 deletions(-) diff --git a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp index df3be22cfb..981d2eb716 100644 --- a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp +++ b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp @@ -47,11 +47,10 @@ #include "ck/tensor_operation/gpu/device/helper.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" -//#include "common.hpp" #include #endif #include -MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS) namespace miopen { namespace solver { @@ -115,6 +114,15 @@ std::vector get_headers_for_test() }); return result; } +void write_buffer(const std::string& filename, const char* buffer, std::size_t size) +{ + std::ofstream os(filename); + os.write(buffer, size); +} +void write_string(const std::string& filename, const std::string_view& buffer) +{ + write_buffer(filename, buffer.data(), buffer.size()); +} struct CKArgs { @@ -213,27 +221,59 @@ bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( [[maybe_unused]] const ProblemDescription& problem) const { // FIXME: rewrite this function - return true; + std::cout << "####### entered isApplicable #######" << std::endl; + // return true; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - if(env::disabled(MIOPEN_DEBUG_3D_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS)) + std::cout << "----------- entered the header guard -----------" << std::endl; + if(env::disabled(MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS)) + { + std::cout << "Check 1: false" << std::endl; return false; - // check if type float else return false - if(problem.GetConv().attribute.deterministic) + } + if(problem.HasNonPackedTensors()) + { + std::cout << "Check 2: false" << std::endl; return false; + } if(!problem.AllTensorsDimsFitIntoInt()) + { + std::cout << "Check 3: false" << std::endl; + return false; + } + if(problem.IsTensorsCasted()) + { + std::cout << "Check 4: false" << std::endl; return false; + } + if(problem.GetConv().attribute.deterministic) + { + std::cout << "Check 5: false" << std::endl; + return false; + } if(problem.HasMixedDataTypes()) + { + std::cout << "Check 6: false" << std::endl; return false; + } if(!problem.IsDirectionForward()) + { + std::cout << "Check 7: false" << std::endl; return false; - if(!problem.Is3d()) + } + if(!problem.Is2d()) + { + std::cout << "Check 8: false" << std::endl; return false; + } if(!(problem.IsLayoutNHWC() || problem.IsLayoutDefault())) + { + std::cout << "Check 9: false" << std::endl; return false; - // needed because layout transpose kernel does not support non-packed tensors - if(problem.IsLayoutDefault() && problem.HasNonPackedTensors()) - return false; + } + std::cout << "------ went through header guard checks ------" << std::endl; + return true; #endif + std::cout << "never entered the header guard" << std::endl; return false; } @@ -243,10 +283,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( { auto x = CKArgs(problem); #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - /**decltype(auto) conv = problem.GetConv(); - decltype(auto) in = problem.GetIn(); - decltype(auto) wei = problem.GetWeights(); - decltype(auto) out = problem.GetOut();**/ const auto workspace_req = GetWorkspaceSize(ctx, problem); @@ -262,21 +298,13 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( srcs.push_back({"main.cpp", src}); auto name = solution[0].GetTemplateParameter("name"); - auto kernel = KernelInfo{}; - kernel.kernel_file = srcs[srcs.size() - 1].path.filename().string(); - kernel.kernel_name = "run_" + name; - // rtc::compile_options options; - // auto name = solution[0].GetTemplateParameter("name"); - // options.kernel_name = "run_" + name; - // TODO: MIOpen has it's own handlers for compilation - // auto k = rtc::compile_kernel(srcs, options); - - /**auto pImpl = std::make_shared(); - pImpl->program = program_name; - pImpl->target = this->GetTargetProperties(); - auto p = HIPOCProgram{}; - p.impl = pImpl; - pImpl->BuildCodeObject(params, src);**/ + auto kernel_info = KernelInfo{}; + auto path = std::strcat(std::getenv("HOME"), "/workspace/MIOpen/src/kernels/main.cpp"); + // should write the generated code into the file + std::string path_name(path); + write_string(path_name, srcs[srcs.size() - 1].content); + kernel_info.kernel_file = path; + kernel_info.kernel_name = "run_" + name; // Grid size calculation auto block_size = solution[0].GetTemplateParameter("BlockSize"); @@ -285,23 +313,40 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( auto grid_size = tmp * x.in_lengths[1]; - kernel.l_wk = {block_size, 1, 1}; - kernel.g_wk = {block_size * grid_size, 1, 1}; + kernel_info.l_wk = {block_size, 1, 1}; + kernel_info.g_wk = {block_size * grid_size, 1, 1}; bool bfp16parm = true; const auto build_params = KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; - kernel.comp_options = build_params.GenerateFor(kbp::HIP{}); - kernel.comp_options += " -DCK_DONT_USE_HIP_RUNTIME_HEADERS"; - kernel.comp_options += " -DCK_CODE_GEN_RTC"; - - soln.construction_params.push_back(kernel); + kernel_info.comp_options = build_params.GenerateFor(kbp::HIP{}); + kernel_info.comp_options += " -DCK_DONT_USE_HIP_RUNTIME_HEADERS"; + kernel_info.comp_options += " -DCK_CODE_GEN_RTC"; + // soln.construction_params.push_back(kernel_info); + std::cout << "============== does it get here? =============" << std::endl; soln.invoker_factory = [=](const std::vector& kernels) { + std::cout << " ------------- outer lambda --------------------" << std::endl; return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); + std::cout << "----------- into inner lambda -----------" << std::endl; + // decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - + std::cout << " ------------- past param assignment -------------" << std::endl; + + std::cout << "=========== invoker factory ===============" << std::endl; + std::cout << "name of kernel: " << name << std::endl; + auto kernel = handle_.AddKernel("tmp", + "tmp", + "cg_main.cpp", + kernel_info.kernel_name, + kernel_info.l_wk, + kernel_info.g_wk, + kernel_info.comp_options, + 0, + src); + std::cout << "in: " << *params.tensors.in << std::endl; + std::cout << "w: " << *params.tensors.w << std::endl; + std::cout << "out: " << *params.tensors.out << std::endl; kernel(params.tensors.in, params.tensors.w, params.tensors.out, From 6481117fa6c631178c1298a48fe0b22eb43f7c77 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 14 Nov 2024 09:15:11 +0000 Subject: [PATCH 49/69] updting CK commit hash to get new changes --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e52011b862..366b9fb033 100755 --- a/Dockerfile +++ b/Dockerfile @@ -114,7 +114,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ miopen-hip # TODO: it should be able to automatically get commit hash from requirements.txt -ARG CK_COMMIT=7d3ee2660b10b3c95f944655a372f53ab16667cd +ARG CK_COMMIT=c7b0b6e770f66a1bc3ee4524945c2519a1f978c3 RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ diff --git a/requirements.txt b/requirements.txt index fa41db58d5..ce68a578a4 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@7d3ee2660b10b3c95f944655a372f53ab16667cd -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON +ROCm/composable_kernel@c7b0b6e770f66a1bc3ee4524945c2519a1f978c3 -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON google/googletest@v1.14.0 From a2c7ce5736e3c311f71b91c74320796aa244d06a Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 14 Nov 2024 09:25:04 +0000 Subject: [PATCH 50/69] fixing build command for codegen --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ce68a578a4..65bcccf1ea 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@c7b0b6e770f66a1bc3ee4524945c2519a1f978c3 -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON +codegen,ROCm/composable_kernel@${c7b0b6e770f66a1bc3ee4524945c2519a1f978c3} -X subdir -DCMAKE_DIR=codegen -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON google/googletest@v1.14.0 From 2df0c6c63c66d36091bcc826cf7601a7c4749a65 Mon Sep 17 00:00:00 2001 From: Astha Date: Wed, 27 Nov 2024 09:24:49 -0500 Subject: [PATCH 51/69] updated CK commit hash --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index e52011b862..22fade64a2 100755 --- a/Dockerfile +++ b/Dockerfile @@ -114,7 +114,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ miopen-hip # TODO: it should be able to automatically get commit hash from requirements.txt -ARG CK_COMMIT=7d3ee2660b10b3c95f944655a372f53ab16667cd +ARG CK_COMMIT=60afb5221962064f05608c1e87482ca93afabf54 RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ diff --git a/requirements.txt b/requirements.txt index fa41db58d5..a8716b2d73 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@7d3ee2660b10b3c95f944655a372f53ab16667cd -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON +ROCm/composable_kernel@60afb5221962064f05608c1e87482ca93afabf54 -DCMAKE_BUILD_TYPE=Release -DCK_USE_CODEGEN=ON google/googletest@v1.14.0 From c8340fb9bd0b42b7030d07bc7643b0bec968e9dc Mon Sep 17 00:00:00 2001 From: Astha Date: Wed, 27 Nov 2024 13:15:55 -0500 Subject: [PATCH 52/69] temporariy removing codegen from CMake --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 507215a45a..643814453a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -847,7 +847,7 @@ target_include_directories(MIOpen PUBLIC ) if(MIOPEN_USE_COMPOSABLEKERNEL) -set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations composable_kernel::ck_host hip::host) +set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations hip::host) endif() if(WIN32) From 5f90f89d99584ae6681b9e48e79fd6a2984f4ddf Mon Sep 17 00:00:00 2001 From: Astha Date: Wed, 27 Nov 2024 14:12:46 -0500 Subject: [PATCH 53/69] updated Dockerfile --- Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 22fade64a2..a97072827d 100755 --- a/Dockerfile +++ b/Dockerfile @@ -125,7 +125,6 @@ RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK -D CMAKE_BUILD_TYPE=Release \ -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1100" \ -D CMAKE_CXX_FLAGS=" -O3 " \ - -DCK_USE_CODEGEN=ON .. && \ make -j $(nproc) install # Composable Kernel installed separated from rbuild to take in values from GPU_ARCHS From 3b05faade262f49a97133bbb018bebe1d97e8399 Mon Sep 17 00:00:00 2001 From: Astha Date: Wed, 27 Nov 2024 21:47:09 -0500 Subject: [PATCH 54/69] fixed error in the Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a97072827d..65a898370e 100755 --- a/Dockerfile +++ b/Dockerfile @@ -124,7 +124,7 @@ RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK -D CMAKE_CXX_COMPILER_LAUNCHER="${COMPILER_LAUNCHER}" \ -D CMAKE_BUILD_TYPE=Release \ -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1100" \ - -D CMAKE_CXX_FLAGS=" -O3 " \ + -D CMAKE_CXX_FLAGS=" -O3 " .. && \ make -j $(nproc) install # Composable Kernel installed separated from rbuild to take in values from GPU_ARCHS From f0641f1675960e059babded3da9fd534fe0554db Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 28 Nov 2024 07:51:19 +0000 Subject: [PATCH 55/69] removed the hardcoded generated file I added in kernels directory for testing --- src/CMakeLists.txt | 3 +- src/kernels/main.cpp | 152 ------------------------------------------- 2 files changed, 1 insertion(+), 154 deletions(-) delete mode 100644 src/kernels/main.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 507215a45a..5678eb289e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -637,8 +637,7 @@ if( MIOPEN_BACKEND MATCHES "OpenCL" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN kernels/xform_bidirect_winograd_data.s kernels/xform_bidirect_winograd_filter.s kernels/xform_bidirect_winograd_out.s - kernels/UniversalTranspose.cl - kernels/main.cpp) + kernels/UniversalTranspose.cl) # Kernels in development lists. # Should be ALWAYS empty in develop branch (at the time of PR merge) diff --git a/src/kernels/main.cpp b/src/kernels/main.cpp deleted file mode 100644 index cef2b8c15d..0000000000 --- a/src/kernels/main.cpp +++ /dev/null @@ -1,152 +0,0 @@ -#include - -struct Epilogue -{ - __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){}; - - template - __host__ __device__ constexpr void operator()(E& e, const D& d) const; - - template <> - __host__ __device__ constexpr void operator()(ck::half_t& e, - const ck::half_t& d) const - { - e = ck::type_convert(alpha_ * e + beta_ * ck::type_convert(d)); - } - - float alpha_; - float beta_; -}; - -using CDEElementOp = Epilogue; -using DeviceConv = - ck::tensor_operation::device::CodegenDeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< - 2, - ck::tensor_layout::convolution::NHWGC, - ck::tensor_layout::convolution::GKYXC, - ck::Tuple<>, - ck::tensor_layout::convolution::NHWGK, - ck::half_t, - ck::half_t, - float, - ck::half_t, - ck::Tuple<>, - ck::half_t, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - CDEElementOp, - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default, - ck::tensor_operation::device::GemmSpecialization::MNKPadding, - 1, - 64, - 64, - 32, - 32, - 8, - 8, - 32, - 32, - 2, - 1, - ck::Sequence<4, 16, 1>, - ck::Sequence<1, 0, 2>, - ck::Sequence<1, 0, 2>, - 2, - 8, - 8, - 1, - ck::Sequence<4, 16, 1>, - ck::Sequence<1, 0, 2>, - ck::Sequence<1, 0, 2>, - 2, - 8, - 8, - 1, - 1, - 1, - ck::Sequence<1, 16, 1, 4>, - 1>; - -constexpr ck::index_t NumATensor = - ck::tensor_operation::device::GetNumABTensors(); -constexpr ck::index_t NumBTensor = - ck::tensor_operation::device::GetNumABTensors(); - -extern "C" __global__ void -run_64_64_32_32_8_8_32_32_2_1(const ck::half_t* in_dev, - const ck::half_t* wei_dev, - ck::half_t* __restrict__ out_dev, - ck::Array in_lengths, - ck::Array in_strides, - ck::Array wei_lengths, - ck::Array wei_strides, - ck::Array out_lengths, - ck::Array out_strides, - ck::Array conv_filter_strides, - ck::Array conv_filter_dilations, - ck::Array input_left_pads, - ck::Array input_right_pads, - const ck::tensor_operation::element_wise::PassThrough a_element_op, - const ck::tensor_operation::element_wise::PassThrough b_element_op, - const CDEElementOp cde_element_op) -{ - - auto arg = DeviceConv::Argument(in_dev, - wei_dev, - ck::Array{}, - out_dev, - in_lengths, - in_strides, - wei_lengths, - wei_strides, - ck::Array, 0>{}, - ck::Array, 0>{}, - out_lengths, - out_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - ck::tensor_operation::element_wise::PassThrough{}, - ck::tensor_operation::element_wise::PassThrough{}, - CDEElementOp{1.0f, 1.0f}); - - constexpr ck::LoopScheduler LoopSched = ck::make_default_loop_scheduler(); - - // GridwiseGemm - using GridwiseGemm = DeviceConv::GridwiseGemm; - - static constexpr auto I0 = ck::Number<0>{}; - - ck::tensor_operation::device::device_grouped_conv_fwd_multiple_abd_xdl_cshuffle< - GridwiseGemm, - const ck::half_t*, - const ck::half_t*, - typename GridwiseGemm::DsGridPointer, - ck::half_t, - ck::tensor_operation::element_wise::PassThrough, - ck::tensor_operation::element_wise::PassThrough, - CDEElementOp, - DeviceConv::AGridDesc_AK0_M_AK1, - DeviceConv::BGridDesc_BK0_N_BK1, - DeviceConv::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - DeviceConv::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, - DeviceConv::Block2ETileMap, - ck::tensor_operation::device::ComputePtrOffsetOfStridedBatch, - ck::integral_constant{}, - false, - false>(arg.p_as_grid_.At(I0), - arg.p_bs_grid_.At(I0), - arg.p_ds_grid_, - arg.p_e_grid_, - arg.a_element_op_, - arg.b_element_op_, - arg.cde_element_op_, - arg.a_g_n_c_wis_lengths_[0], // Group count - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_etile_map_, - arg.compute_ptr_offset_of_batch_); -}; From a96e0d9506a593f2eecc5350e3141113dc844dc6 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 28 Nov 2024 07:53:58 +0000 Subject: [PATCH 56/69] removed version using KernelInfo for compilation, switched to and finished debugging AddKernels version - solver is now running --- ...ip_implicit_gemm_2d_grouped_fwd_xdlops.cpp | 97 ++++++++++++------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp index 981d2eb716..429b7842b3 100644 --- a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp +++ b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp @@ -59,18 +59,10 @@ namespace conv { using ProblemDescription = miopen::conv::ProblemDescription; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - -using InLayout = ck::tensor_layout::convolution::NDHWGC; -using WeiLayout = ck::tensor_layout::convolution::GKZYXC; -using OutLayout = ck::tensor_layout::convolution::NDHWGK; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -using Bilinear = ck::tensor_operation::element_wise::Bilinear; -using Scale = ck::tensor_operation::element_wise::Scale; static constexpr ck::index_t NumDimSpatial = 2; const std::string conv_compile_check = R"__ck__( #include <${include}> - ${template}; )__ck__"; @@ -114,15 +106,6 @@ std::vector get_headers_for_test() }); return result; } -void write_buffer(const std::string& filename, const char* buffer, std::size_t size) -{ - std::ofstream os(filename); - os.write(buffer, size); -} -void write_string(const std::string& filename, const std::string_view& buffer) -{ - write_buffer(filename, buffer.data(), buffer.size()); -} struct CKArgs { @@ -132,8 +115,8 @@ struct CKArgs prob.NumDim = NumDimSpatial; prob.G = ProblemInterpreter::GetGroupCountG(problem); prob.N = ProblemInterpreter::GetBatchN(problem); - int K1 = ProblemInterpreter::GetOutputChannelK(problem); - int C1 = ProblemInterpreter::GetInputChannelC(problem); + int K1 = prob.G * ProblemInterpreter::GetOutputChannelK(problem); + int C1 = prob.G * ProblemInterpreter::GetInputChannelC(problem); prob.C = C1 / prob.G; // Number of input Channel per group prob.K = K1 / prob.G; // Number of output Channel per group prob.Y = ProblemInterpreter::GetFilterHeightY(problem); @@ -146,30 +129,45 @@ struct CKArgs in_lengths = {prob.G, prob.N, prob.C, prob.Hi, prob.Wi}; out_lengths = {prob.G, prob.N, prob.K, prob.Ho, prob.Wo}; wei_lengths = {prob.G, prob.K, prob.C, prob.Y, prob.X}; - - in_strides = {prob.C, + std::cout << "in lengths: " << prob.G << ", " << prob.N << ", " << prob.C << ", " << prob.Hi + << ", " << prob.Wi << std::endl; + std::cout << "weight lengths: " << prob.G << ", " << prob.K << ", " << prob.C << ", " + << prob.Y << ", " << prob.X << std::endl; + std::cout << "out lengths: " << prob.G << ", " << prob.N << ", " << prob.K << ", " + << prob.Ho << ", " << prob.Wo << std::endl; + + in_strides = {prob.C, prob.Hi * prob.Wi * prob.G * prob.C, 1, prob.Wi * prob.G * prob.C, prob.G * prob.C}; - out_strides = {prob.K, + out_strides = {prob.K, prob.Ho * prob.Wo * prob.G * prob.K, 1, prob.Wo * prob.G * prob.K, prob.G * prob.K}; - wei_strides = {prob.K * prob.Y * prob.X * prob.C, + wei_strides = {prob.K * prob.Y * prob.X * prob.C, prob.Y * prob.X * prob.C, 1, prob.X * prob.C, prob.C}; + std::cout << "in strides: " << prob.C << ", " << prob.Hi * prob.Wi * prob.G * prob.C + << ", 1, " << prob.Wi * prob.G * prob.C << ", " << prob.G * prob.C << std::endl; + std::cout << "wei strides: " << prob.K * prob.Y * prob.X * prob.C << ", " + << prob.Y * prob.X * prob.C << ", 1, " << prob.X * prob.C << ", " << prob.C + << std::endl; + std::cout << "out strides: " << prob.K << ", " << prob.Ho * prob.Wo * prob.G * prob.K + << ", 1, " << prob.Wo * prob.G * prob.K << ", " << prob.G * prob.K << std::endl; + filter_strides = {ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), ProblemInterpreter::GetAdjustedConvolutionStrideW(problem)}; filter_dilations = {ProblemInterpreter::GetAdjustedConvolutionDilationH(problem), ProblemInterpreter::GetAdjustedConvolutionDilationW(problem)}; lPadding = {ProblemInterpreter::GetInputLeftPadH(problem), ProblemInterpreter::GetInputLeftPadW(problem)}; - rPadding = {ProblemInterpreter::GetAdjustedInputRightPadH(problem), - ProblemInterpreter::GetAdjustedInputRightPadW(problem)}; + // rPadding = {ProblemInterpreter::GetAdjustedInputRightPadH(problem), + // ProblemInterpreter::GetAdjustedInputRightPadW(problem)}; + rPadding = {1, 1}; } CKArgs(const CKArgs&) = default; @@ -300,9 +298,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( auto kernel_info = KernelInfo{}; auto path = std::strcat(std::getenv("HOME"), "/workspace/MIOpen/src/kernels/main.cpp"); - // should write the generated code into the file - std::string path_name(path); - write_string(path_name, srcs[srcs.size() - 1].content); kernel_info.kernel_file = path; kernel_info.kernel_name = "run_" + name; @@ -312,10 +307,13 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( auto tmp = get_launch_params(solution[0], x.out_lengths, x.out_strides); auto grid_size = tmp * x.in_lengths[1]; + std::cout << " ------- grid size: " << grid_size << std::endl; - kernel_info.l_wk = {block_size, 1, 1}; - kernel_info.g_wk = {block_size * grid_size, 1, 1}; + kernel_info.l_wk = {256, 1, 1}; + kernel_info.g_wk = {16384, 1, 1}; + std::cout << "block size: " << block_size << ", grid size: " << grid_size + << ", launch: " << block_size * grid_size << std::endl; bool bfp16parm = true; const auto build_params = KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; @@ -324,7 +322,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( kernel_info.comp_options += " -DCK_CODE_GEN_RTC"; // soln.construction_params.push_back(kernel_info); - std::cout << "============== does it get here? =============" << std::endl; soln.invoker_factory = [=](const std::vector& kernels) { std::cout << " ------------- outer lambda --------------------" << std::endl; return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { @@ -335,6 +332,8 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( std::cout << "=========== invoker factory ===============" << std::endl; std::cout << "name of kernel: " << name << std::endl; + std::cout << "block size: " << block_size << ", grid size: " << grid_size + << ", launch: " << block_size * grid_size << std::endl; auto kernel = handle_.AddKernel("tmp", "tmp", "cg_main.cpp", @@ -344,9 +343,39 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( kernel_info.comp_options, 0, src); - std::cout << "in: " << *params.tensors.in << std::endl; - std::cout << "w: " << *params.tensors.w << std::endl; - std::cout << "out: " << *params.tensors.out << std::endl; + std::cout << "in: " << params.tensors.inDesc << std::endl; + std::cout << "lens: " << params.tensors.inDesc.GetLengths().size() << std::endl; + // std::cout << "w: " << *params.tensors.w << std::endl; + // std::cout << "out: " << *params.tensors.out << std::endl; + std::cout << "conv: " << problem.GetConv() << std::endl; + std::cout << "in: " << problem.GetIn() << std::endl; + std::cout << "w: " << problem.GetWeights() << std::endl; + std::cout << "out: " << problem.GetOut() << std::endl; + std::cout << "in lengths: " << x.in_lengths[0] << ", " << x.in_lengths[1] << ", " + << x.in_lengths[2] << ", " << x.in_lengths[3] << ", " << x.in_lengths[4] + << ", " << std::endl; + std::cout << "w lengths: " << x.wei_lengths[0] << ", " << x.wei_lengths[1] << ", " + << x.wei_lengths[2] << ", " << x.wei_lengths[3] << ", " << x.wei_lengths[4] + << ", " << std::endl; + std::cout << "out lengths: " << x.out_lengths[0] << ", " << x.out_lengths[1] << ", " + << x.out_lengths[2] << ", " << x.out_lengths[3] << ", " << x.out_lengths[4] + << ", " << std::endl; + std::cout << "in strides: " << x.in_strides[0] << ", " << x.in_strides[1] << ", " + << x.in_strides[2] << ", " << x.in_strides[3] << ", " << x.in_strides[4] + << ", " << std::endl; + std::cout << "wei strides: " << x.wei_strides[0] << ", " << x.wei_strides[1] << ", " + << x.wei_strides[2] << ", " << x.wei_strides[3] << ", " << x.wei_strides[4] + << ", " << std::endl; + std::cout << "out strides: " << x.out_strides[0] << ", " << x.out_strides[1] << ", " + << x.out_strides[2] << ", " << x.out_strides[3] << ", " << x.out_strides[4] + << ", " << std::endl; + std::cout << "filter strides: " << x.filter_strides[0] << ", " << x.filter_strides[1] + << std::endl; + std::cout << "filter dilations: " << x.filter_dilations[0] << ", " + << x.filter_dilations[1] << std::endl; + std::cout << "left pad: " << x.lPadding[0] << ", " << x.lPadding[1] << std::endl; + std::cout << "right pad: " << x.rPadding[0] << ", " << x.rPadding[1] << std::endl; + kernel(params.tensors.in, params.tensors.w, params.tensors.out, From 74924459ce3c2729b05938eb0159dd8cbd896a51 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Thu, 28 Nov 2024 07:57:14 +0000 Subject: [PATCH 57/69] Fixed gtest. debugged error with mapping CK problem description to MIOpen as well as the corresponding command line argument (right padding is hardcoded for now). Was having memory access issues that are currently resolved due to hardcoding of block size/grid size - need to figure out how this calculation differs from CK. With this version both the solver and the gtest are running (with the codegen packages). Need to integrate standalone codegen build --- test/gtest/codegen_group_conv.hpp | 567 ++++++++++++++++++++++++ test/gtest/codegen_group_conv2d_fwd.cpp | 5 +- 2 files changed, 570 insertions(+), 2 deletions(-) create mode 100644 test/gtest/codegen_group_conv.hpp diff --git a/test/gtest/codegen_group_conv.hpp b/test/gtest/codegen_group_conv.hpp new file mode 100644 index 0000000000..90f588b5d5 --- /dev/null +++ b/test/gtest/codegen_group_conv.hpp @@ -0,0 +1,567 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2023 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ +#pragma once + +#include "../random.hpp" + +#include "get_handle.hpp" +#include +#include +#include + +#include "../driver/tensor_driver.hpp" +#include "conv_common.hpp" + +namespace codegen_group_conv { + +using Direction = miopen::conv::Direction; + +template +struct GroupConvTestConfig +{ +}; + +template <> +struct GroupConvTestConfig<2u> +{ + + struct Size2D + { + size_t y; + size_t x; + }; + + size_t G; + size_t N; + size_t C; + size_t K; + + Size2D img; + Size2D filter; + Size2D pad; + Size2D stride; + Size2D dilation; + + friend std::ostream& operator<<(std::ostream& os, const GroupConvTestConfig& tc) + { + return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " K:" << tc.K + << " H:" << tc.img.y << " W:" << tc.img.x << " y:" << tc.filter.y + << " x:" << tc.filter.x << " pad.y:" << tc.pad.y << " pad.x:" << tc.pad.x + << " stride.y:" << tc.stride.y << "stride.x" << tc.stride.x + << " dilation.y:" << tc.dilation.y << " dilation.x" << tc.dilation.x; + } + + std::vector GetInput() { return {N, C, img.y, img.x}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {K, C / G, filter.y, filter.x}; + } + + miopen::ConvolutionDescriptor GetConv() + { + return miopen::ConvolutionDescriptor{ + 2, + miopenConvolution, + miopenPaddingDefault, + {static_cast(pad.y), static_cast(pad.x)}, + {static_cast(stride.y), static_cast(stride.x)}, + {static_cast(dilation.y), static_cast(dilation.x)}, + {0, 0}, + static_cast(G), + 1.0}; + } + + template + static std::vector GetConfigs() + { + if constexpr(cg == true) + { + return { + {32, 256, 32, 64, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + }; + } + else + { + if constexpr(DIR == Direction::Forward) + { + + // clang-format off + return { + // g n C K img filter pad stride dilation + {1 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {1 , 256, 12 , 12 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {4 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {8 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {8 , 256, 384 , 384 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {32, 256, 1024, 2048, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + }; + // clang-format on + } + else if constexpr(DIR == Direction::BackwardData || DIR == Direction::BackwardWeights) + { + // clang-format off + return { + // g n C K img filter pad stride dilation + {1 , 1 , 1 , 1 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {1 , 1 , 4 , 4 , {28, 28}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}, + {1 , 1 , 1 , 1 , {8 , 8 }, {2, 2}, {0, 0}, {1, 1}, {1, 1}}, + {8 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {8 , 256, 384 , 384 , {28, 28}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}, + {32, 256, 1024, 2048, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + }; + // clang-format on + } + else + { + std::abort(); + } + } + } +}; + +template <> +struct GroupConvTestConfig<3u> +{ + + struct Size3D + { + size_t z; + size_t y; + size_t x; + }; + + size_t G; + size_t N; + size_t C; + size_t K; + + Size3D img; + Size3D filter; + Size3D pad; + Size3D stride; + Size3D dilation; + + friend std::ostream& operator<<(std::ostream& os, const GroupConvTestConfig<3u>& tc) + { + return os << " G:" << tc.G << " N:" << tc.N << " C:" << tc.C << " K:" << tc.K + << " D:" << tc.img.z << " H:" << tc.img.y << " W:" << tc.img.x + << " z:" << tc.filter.z << " y:" << tc.filter.y << " x:" << tc.filter.x + << " pad.z:" << tc.pad.z << " pad.y:" << tc.pad.y << " pad.x:" << tc.pad.x + << " stride.z:" << tc.stride.z << " stride.y:" << tc.stride.y + << " stride.x:" << tc.stride.x << " dilation.z:" << tc.dilation.z + << " dilation.y:" << tc.dilation.y << " dilation.x:" << tc.dilation.x; + } + + std::vector GetInput() { return {N, C, img.z, img.y, img.x}; } + std::vector GetWeights() + { + EXPECT_EQUAL(C % G, 0); + return {K, C / G, filter.z, filter.y, filter.x}; + } + + miopen::ConvolutionDescriptor GetConv() + { + return miopen::ConvolutionDescriptor{ + 3, + miopenConvolution, + miopenPaddingDefault, + {static_cast(pad.z), static_cast(pad.y), static_cast(pad.x)}, + {static_cast(stride.z), static_cast(stride.y), static_cast(stride.x)}, + {static_cast(dilation.z), + static_cast(dilation.y), + static_cast(dilation.x)}, + {0, 0, 0}, + static_cast(G), + 1.0}; + } + + template + static std::vector GetConfigs() + { + + if constexpr(DIR == Direction::Forward) + { + // clang-format off + return { + // g n C K img filter pad stride dilation + {1 , 128, 64, 64, {14, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {1 , 64 , 32, 32, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {2 , 128, 32, 32, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {8 , 128, 32, 32, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {2 , 128, 32, 32, {28, 28, 28}, {3, 3, 3}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, + {8 , 64 , 32, 32, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {16, 64 , 32, 32, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {2 , 128, 32, 32, {28, 28, 28}, {3, 3, 3}, {0, 0, 0}, {2, 2, 2}, {1, 1, 1}}, + {8 , 64 , 32, 32, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}}, + {16, 64 , 32, 32, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {2, 2, 2}, {1, 1, 1}}, + {3 , 48 , 48, 48, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {3 , 48 , 39, 39, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {5 , 120, 60, 60, {28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + }; + // clang-format on + } + else if constexpr(DIR == Direction::BackwardData || DIR == Direction::BackwardWeights) + { + // clang-format off + return { + // g n C K img filter pad stride dilation + {1, 1 , 4 , 4 ,{14, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {1, 1 , 1 , 1 ,{4 , 4 , 4 }, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {1, 1 , 1 , 1 ,{8 , 8 , 8 }, {2, 2, 2}, {0, 0, 0}, {1, 1, 1}, {1, 1, 1}}, + {1, 1 , 1 , 1 ,{8 , 8 , 8 }, {2, 2, 2}, {0, 0, 0}, {2, 2, 2}, {1, 1, 1}}, + {1, 64 , 32, 16,{28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {8, 128, 16, 32,{28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {8, 128, 16, 16,{28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {4, 128, 8 , 4 ,{28, 28, 28}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {4, 128, 4 , 8 ,{28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + {2, 128, 2 , 2 ,{28, 28, 28}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}, + }; + // clang-format on + } + else + { + std::abort(); + } + } +}; + +template +struct GroupConvTestFix + : public ::testing::TestWithParam< + std::tuple, float, float, miopenTensorLayout_t>> +{ + static_assert(NDIM == 2u || NDIM == 3u, "NDIM must be 2 for 2D Conv and 3 for 3D Conv"); + +private: + using Base = ::testing::TestWithParam< + std::tuple, float, float, miopenTensorLayout_t>>; + + template + void SetupFwd(F&& gen_value) + { + input.generate(gen_value); + weights.generate(gen_value); + std::fill(output.begin(), output.end(), T(0)); + } + + template + void SetupBwd(F&& gen_value) + { + output.generate(gen_value); + weights.generate(gen_value); + std::fill(input.begin(), input.end(), T(0)); + } + + template + void SetupWrw(FI&& gen_value_in, FO&& gen_value_out) + { + input.generate(gen_value_in); + output.generate(gen_value_out); + std::fill(weights.begin(), weights.end(), T{0}); + } + + void verify(const tensor& computed) + { + EXPECT_FALSE(miopen::range_zero(ref)) << "Cpu data is all zeros"; + EXPECT_FALSE(miopen::range_zero(computed)) << "Gpu data is all zeros"; + EXPECT_TRUE(miopen::range_distance(ref) == miopen::range_distance(computed)); + + /// \todo figure out a better threshold for error checking, esp. for bwd + /// data and weight passes. --amberhassaan + double threshold = 80; + if(CONV_DIR == Direction::Forward) + { + threshold *= std::numeric_limits::epsilon(); + } + else + { + threshold = 3.0e-3; + } + auto error = miopen::rms_range(ref, computed); + + EXPECT_FALSE(miopen::find_idx(ref, miopen::not_finite) >= 0) + << "Non finite number found in the reference output"; + + EXPECT_TRUE(error <= threshold) + << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + } + + /// \todo had to pull out tensor and problem construction because the order of + /// tensors and tensor-descriptors varies by direction. Will move these + /// constructors back in this method once we have a uniform order of (x, w, y) + /// tensors everywhere. --amberhassaan + template + void RunSolverImpl(const ConvTensorsType& tensors, const ProblemDescription& problem) + { + + std::cout << conv_config << std::endl; + auto&& handle = get_handle(); + + // static_assert(cg == true, "Cg is false"); + + Solver solv{}; + + auto ctx = miopen::ExecutionContext{}; + + ctx.SetStream(&handle); + + if(!solv.IsApplicable(ctx, problem)) + { + test_skipped = true; + GTEST_SKIP() << solv.SolverDbId() << "Not Applicable for this problem" << conv_config; + } + + if(solv.MayNeedWorkspace()) + { + wspace.resize(solv.GetWorkspaceSize(ctx, problem)); + } + + const auto invoke_params = + InvokeParamType{tensors, wspace.ptr(), wspace.size(), false, alpha, beta}; + + ASSERT_TRUE(solv.IsApplicable(ctx, problem)); + // auto sol = solv.GetSolution(ctx, problem); + // if(cg == false){ solv::foo = 1;} + auto sol = solv.GetSolution(ctx, problem); + /**if constexpr(cg == false){ + sol = solv.GetSolution(ctx, problem, solv.GetDefaultPerformanceConfig(ctx, + problem)); + }**/ + ASSERT_TRUE(sol.Succeeded()); + ASSERT_TRUE(sol.invoker_factory); + const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); + (invoker)(handle, invoke_params); + handle.Finish(); + } + + template + void DispatchSolver() + { + if constexpr(cg == true && CONV_DIR == Direction::Forward) + { + RunSolverImpl( + miopen::ConvDataTensors{input.desc, + in_dev.get(), + weights.desc, + wei_dev.get(), + output.desc, + out_dev.get()}, + miopen::conv::ProblemDescription{input.desc, + weights.desc, + output.desc, + conv_desc, + CONV_DIR, + 0 /*bias*/, + alpha, + beta}); + } + } + +public: + void RunSolver() + { + if constexpr(NDIM == 2u) + { + DispatchSolver(); + } + else + { + DispatchSolver(); + } + } + +protected: + void SetUp() override + { + float alpha_val; + float beta_val; + test_skipped = false; + std::tie(conv_config, alpha_val, beta_val, tensor_layout) = Base::GetParam(); + + alpha = miopen::Scalar(&alpha_val, miopenFloat); + beta = miopen::Scalar(&beta_val, miopenFloat); + + input = tensor{tensor_layout, conv_config.GetInput()}; + weights = tensor{tensor_layout, conv_config.GetWeights()}; + + conv_desc = conv_config.GetConv(); + + miopen::TensorDescriptor output_desc = + conv_desc.GetForwardOutputTensor(input.desc, weights.desc, miopen_type{}); + output = tensor{tensor_layout, output_desc.GetLengths()}; + + auto gen_value = [](auto...) { + return prng::gen_A_to_B(static_cast(-3.0), static_cast(3.0)); + }; + + if constexpr(CONV_DIR == Direction::Forward) + { + SetupFwd(gen_value); + } + else if constexpr(CONV_DIR == Direction::BackwardData) + { + SetupBwd(gen_value); + } + else + { + + // Half16 can store up to 16384. + // If we initialize tensor with 5 * number_of_accumulations in tensor + // this will cause over flow. Hence we pick smaller number, since + // our tests have tensor with large sizes. + auto gen_value_wrw_in = [](auto...) { + return prng::gen_A_to_B(static_cast(-0.1), static_cast(0.1)); + }; + auto gen_value_wrw_out = [](auto...) { + return prng::gen_A_to_B(static_cast(-0.01), static_cast(0.1)); + }; + static_assert(CONV_DIR == Direction::BackwardWeights); + // in and out are populated with different values. + SetupWrw(gen_value_wrw_in, gen_value_wrw_out); + } + + auto& handle = get_handle(); + in_dev = handle.Write(input.data); + wei_dev = handle.Write(weights.data); + out_dev = handle.Write(output.data); + } + + void TearDown() override + { + if(test_skipped) + return; + + auto& handle = get_handle(); + + if constexpr(CONV_DIR == Direction::Forward) + { + ref = ref_conv_fwd(input, weights, output, conv_desc, alpha, beta); + handle.ReadToVec(out_dev, output.data); + verify(output); + } + else if constexpr(CONV_DIR == Direction::BackwardData) + { + ref = ref_conv_bwd(input, weights, output, conv_desc, alpha, beta); + handle.ReadToVec(in_dev, input.data); + verify(input); + } + else + { + static_assert(CONV_DIR == Direction::BackwardWeights); + ref = ref_conv_wrw(input, weights, output, conv_desc, alpha, beta); + handle.ReadToVec(wei_dev, weights.data); + verify(weights); + } + } + + GroupConvTestConfig conv_config; + miopen::ConvolutionDescriptor conv_desc; + tensor input; + tensor weights; + tensor output; + tensor ref; + miopen::Allocator::ManageDataPtr in_dev; + miopen::Allocator::ManageDataPtr wei_dev; + miopen::Allocator::ManageDataPtr out_dev; + bool test_skipped = false; + miopenTensorLayout_t tensor_layout = miopenTensorNHWC; + Workspace wspace{}; + + miopen::Scalar alpha{1.0}; + miopen::Scalar beta{0.0}; +}; + +template +std::vector GetLayoutValues() +{ + static_assert(NDIM == 2u || NDIM == 3u); + if constexpr(NDIM == 2u) + { + return {miopenTensorNHWC, miopenTensorNCHW}; + } + else + { + return {miopenTensorNDHWC, miopenTensorNCDHW}; + } +} + +} // namespace codegen_group_conv + +// Test case based on 2d vs 3d dimension +// 2d conv only support alpha 1.0 +template +std::vector GetAlphaValues() +{ + if constexpr(ND == 3) + { + return {1.0f, 2.2f}; /* alpha, can't be zero*/ + } + else + { + return {1.0f}; + } +} + +// Test case based on 2d vs 3d dimension +// 2d conv only support beta 0.0 +template +std::vector GetBetaValues() +{ + if constexpr(ND == 3) + { + return {0.0f, 3.3f}; + } + else + { + return {0.0f}; + } +} + +#define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir, cg) \ + struct GPU_GroupConv##ndim##D_##dir##_##naming_type##_##cg \ + : GroupConvTestFix \ + { \ + }; \ + TEST_P(GPU_GroupConv##ndim##D_##dir##_##naming_type##_##cg, \ + GroupConv##ndim##D_##dir##_##type##_##cg##_Test) \ + { \ + RunSolver(); \ + } \ + INSTANTIATE_TEST_SUITE_P( \ + Full, \ + GPU_GroupConv##ndim##D_##dir##_##naming_type##_##cg, \ + testing::Combine( \ + testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ + testing::ValuesIn(GetAlphaValues()), \ + testing::ValuesIn(GetBetaValues()), \ + testing::ValuesIn(GetLayoutValues()))); + +#define DEFINE_CG_GROUP_CONV2D_TEST(type, naming_type, dir) \ + DEFINE_GROUP_CONV_TEST(2, type, naming_type, dir, true) diff --git a/test/gtest/codegen_group_conv2d_fwd.cpp b/test/gtest/codegen_group_conv2d_fwd.cpp index 04b504b619..859b41badf 100644 --- a/test/gtest/codegen_group_conv2d_fwd.cpp +++ b/test/gtest/codegen_group_conv2d_fwd.cpp @@ -25,8 +25,9 @@ *******************************************************************************/ #include -#include "group_conv.hpp" +//#include "group_conv.hpp" +#include "codegen_group_conv.hpp" -using namespace group_conv; +using namespace codegen_group_conv; DEFINE_CG_GROUP_CONV2D_TEST(half, FP16, Forward); From 0c7fb9ce118337733754d5daa7eed919347c915c Mon Sep 17 00:00:00 2001 From: Astha Date: Thu, 28 Nov 2024 13:01:41 -0500 Subject: [PATCH 58/69] added standalone codegen build into Dockerfilew --- Dockerfile | 6 ++++++ src/CMakeLists.txt | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 65a898370e..c326814312 100755 --- a/Dockerfile +++ b/Dockerfile @@ -127,6 +127,12 @@ RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK -D CMAKE_CXX_FLAGS=" -O3 " .. && \ make -j $(nproc) install +RUN rm -rf CMakeCache.txt &&\ + CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen \ + -D CMAKE_CXX_COMPILER_LAUNCHER="${COMPILER_LAUNCHER}" \ + -D CMAKE_BUILD_TYPE=Release \ + -D CMAKE_CXX_FLAGS=" -O3 " && \ + make -j $(nproc) install # Composable Kernel installed separated from rbuild to take in values from GPU_ARCHS # this can minimize build time RUN sed -i '/composable_kernel/d' /requirements.txt diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0daf4cdeef..5678eb289e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -846,7 +846,7 @@ target_include_directories(MIOpen PUBLIC ) if(MIOPEN_USE_COMPOSABLEKERNEL) -set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations hip::host) +set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations composable_kernel::ck_host hip::host) endif() if(WIN32) From e5547b7ed2622afca1762d64602e4ede014abfcd Mon Sep 17 00:00:00 2001 From: Astha Date: Fri, 29 Nov 2024 02:35:31 -0500 Subject: [PATCH 59/69] updated codegen build in Dockerfile --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index c326814312..f5cd2238d5 100755 --- a/Dockerfile +++ b/Dockerfile @@ -125,9 +125,8 @@ RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK -D CMAKE_BUILD_TYPE=Release \ -D GPU_ARCHS="gfx908;gfx90a;gfx942;gfx1100" \ -D CMAKE_CXX_FLAGS=" -O3 " .. && \ - make -j $(nproc) install - -RUN rm -rf CMakeCache.txt &&\ + make -j $(nproc) install && \ + rm -rf CMakeCache.txt &&\ CXX=/opt/rocm/llvm/bin/clang++ cmake ../codegen \ -D CMAKE_CXX_COMPILER_LAUNCHER="${COMPILER_LAUNCHER}" \ -D CMAKE_BUILD_TYPE=Release \ From 6d1a610fa5588c6f4a869bb9edf5ac6824b224f6 Mon Sep 17 00:00:00 2001 From: Astha Date: Fri, 29 Nov 2024 09:16:13 -0500 Subject: [PATCH 60/69] resolved stdexcept include errors: updated CK commit hash --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index f5cd2238d5..f030521ca7 100755 --- a/Dockerfile +++ b/Dockerfile @@ -114,7 +114,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ miopen-hip # TODO: it should be able to automatically get commit hash from requirements.txt -ARG CK_COMMIT=60afb5221962064f05608c1e87482ca93afabf54 +ARG CK_COMMIT=563c1e2384af95f9784d682b4590bfcae837b0c4 RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ diff --git a/requirements.txt b/requirements.txt index b2c6d2a3a4..6d7dcc7672 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@60afb5221962064f05608c1e87482ca93afabf54 -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@563c1e2384af95f9784d682b4590bfcae837b0c4 -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From df67fda4a7da6e61c82dbfe43c40055242990e03 Mon Sep 17 00:00:00 2001 From: Astha Date: Sun, 1 Dec 2024 09:58:06 -0500 Subject: [PATCH 61/69] finished debugging integration of standalone codegen build - working now --- CMakeLists.txt | 6 ++++-- src/CMakeLists.txt | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35272cc10e..49e6864084 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -154,7 +154,7 @@ if(NOT WIN32 AND NOT MIOPEN_WORKAROUND_USE_BOOST_FILESYSTEM) check_cxx_linker_flag(-lstdc++fs HAS_LIB_STD_FILESYSTEM) endif() -list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) +list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip /usr/local/lib) option(ENABLE_HIP_WORKAROUNDS Off) set(MIOPEN_INSTALL_CXX_HEADERS Off CACHE BOOL "Install MIOpen's C++ header interface") @@ -327,7 +327,9 @@ add_compile_definitions($<$:HIP_COMPILER_FLAGS=${HIP_COMPI # HIP if( MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU") if(MIOPEN_USE_COMPOSABLEKERNEL) - find_package(composable_kernel 1.0.0 COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations ck_host) + include(/usr/local/lib/cmake/composable_kernel_host/ck_host_targets.cmake) + find_package(ck_host) + find_package(composable_kernel 1.0.0 COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations ) endif() if( MIOPEN_BACKEND STREQUAL "HIPNOGPU") set(MIOPEN_MODE_NOGPU 1) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5678eb289e..63924003b8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -334,7 +334,7 @@ set( MIOpen_Source solver/softmarginloss/forward_softmarginloss.cpp solver/softmax/attn_softmax.cpp solver/softmax/softmax.cpp - solver/tmp_conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp + solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp subbuffers.cpp t5layernorm_api.cpp target_properties.cpp @@ -843,6 +843,7 @@ endfunction() target_include_directories(MIOpen PUBLIC $ + /usr/local/lib/composable_kernel_host/include ) if(MIOPEN_USE_COMPOSABLEKERNEL) From 36ecea9ecba49c93f16c61eac52fe3b05a3f2452 Mon Sep 17 00:00:00 2001 From: Astha Date: Sun, 1 Dec 2024 10:04:43 -0500 Subject: [PATCH 62/69] cleaned up solver: removed commented code and debug prints --- ...ip_implicit_gemm_2d_grouped_fwd_xdlops.cpp | 104 ------------------ 1 file changed, 104 deletions(-) diff --git a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp index 429b7842b3..878b1d9e80 100644 --- a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp +++ b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp @@ -129,12 +129,6 @@ struct CKArgs in_lengths = {prob.G, prob.N, prob.C, prob.Hi, prob.Wi}; out_lengths = {prob.G, prob.N, prob.K, prob.Ho, prob.Wo}; wei_lengths = {prob.G, prob.K, prob.C, prob.Y, prob.X}; - std::cout << "in lengths: " << prob.G << ", " << prob.N << ", " << prob.C << ", " << prob.Hi - << ", " << prob.Wi << std::endl; - std::cout << "weight lengths: " << prob.G << ", " << prob.K << ", " << prob.C << ", " - << prob.Y << ", " << prob.X << std::endl; - std::cout << "out lengths: " << prob.G << ", " << prob.N << ", " << prob.K << ", " - << prob.Ho << ", " << prob.Wo << std::endl; in_strides = {prob.C, prob.Hi * prob.Wi * prob.G * prob.C, @@ -151,13 +145,6 @@ struct CKArgs 1, prob.X * prob.C, prob.C}; - std::cout << "in strides: " << prob.C << ", " << prob.Hi * prob.Wi * prob.G * prob.C - << ", 1, " << prob.Wi * prob.G * prob.C << ", " << prob.G * prob.C << std::endl; - std::cout << "wei strides: " << prob.K * prob.Y * prob.X * prob.C << ", " - << prob.Y * prob.X * prob.C << ", 1, " << prob.X * prob.C << ", " << prob.C - << std::endl; - std::cout << "out strides: " << prob.K << ", " << prob.Ho * prob.Wo * prob.G * prob.K - << ", 1, " << prob.Wo * prob.G * prob.K << ", " << prob.G * prob.K << std::endl; filter_strides = {ProblemInterpreter::GetAdjustedConvolutionStrideH(problem), ProblemInterpreter::GetAdjustedConvolutionStrideW(problem)}; @@ -174,21 +161,6 @@ struct CKArgs CKArgs(CKArgs&&) noexcept = default; CKArgs& operator=(const CKArgs&) = default; - /**int G; - int N; - int K; - int C; - int C1; - int K1; - int Hi; - int Wi; - int Di; - int Ho; - int Wo; - int Do; - int Y; - int X; - int Z;**/ ck::host::conv::Problem_Conv_Fwd prob; ck::Array in_lengths; ck::Array in_strides; @@ -200,7 +172,6 @@ struct CKArgs ck::Array filter_dilations; ck::Array lPadding; ck::Array rPadding; - // miopenAlphaBetaCase_t alpha_beta_case; }; } // namespace @@ -218,60 +189,45 @@ bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) const { - // FIXME: rewrite this function - std::cout << "####### entered isApplicable #######" << std::endl; - // return true; #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL - std::cout << "----------- entered the header guard -----------" << std::endl; if(env::disabled(MIOPEN_DEBUG_GROUP_CONV_IMPLICIT_GEMM_HIP_FWD_XDLOPS)) { - std::cout << "Check 1: false" << std::endl; return false; } if(problem.HasNonPackedTensors()) { - std::cout << "Check 2: false" << std::endl; return false; } if(!problem.AllTensorsDimsFitIntoInt()) { - std::cout << "Check 3: false" << std::endl; return false; } if(problem.IsTensorsCasted()) { - std::cout << "Check 4: false" << std::endl; return false; } if(problem.GetConv().attribute.deterministic) { - std::cout << "Check 5: false" << std::endl; return false; } if(problem.HasMixedDataTypes()) { - std::cout << "Check 6: false" << std::endl; return false; } if(!problem.IsDirectionForward()) { - std::cout << "Check 7: false" << std::endl; return false; } if(!problem.Is2d()) { - std::cout << "Check 8: false" << std::endl; return false; } if(!(problem.IsLayoutNHWC() || problem.IsLayoutDefault())) { - std::cout << "Check 9: false" << std::endl; return false; } - std::cout << "------ went through header guard checks ------" << std::endl; return true; #endif - std::cout << "never entered the header guard" << std::endl; return false; } @@ -307,33 +263,20 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( auto tmp = get_launch_params(solution[0], x.out_lengths, x.out_strides); auto grid_size = tmp * x.in_lengths[1]; - std::cout << " ------- grid size: " << grid_size << std::endl; kernel_info.l_wk = {256, 1, 1}; kernel_info.g_wk = {16384, 1, 1}; - std::cout << "block size: " << block_size << ", grid size: " << grid_size - << ", launch: " << block_size * grid_size << std::endl; bool bfp16parm = true; const auto build_params = KernelBuildParameters{{"MIOPEN_USE_FP16", static_cast(bfp16parm)}}; kernel_info.comp_options = build_params.GenerateFor(kbp::HIP{}); kernel_info.comp_options += " -DCK_DONT_USE_HIP_RUNTIME_HEADERS"; kernel_info.comp_options += " -DCK_CODE_GEN_RTC"; - // soln.construction_params.push_back(kernel_info); soln.invoker_factory = [=](const std::vector& kernels) { - std::cout << " ------------- outer lambda --------------------" << std::endl; return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - std::cout << "----------- into inner lambda -----------" << std::endl; - // decltype(auto) kernel = handle_.Run(kernels.front()); decltype(auto) params = raw_params.CastTo(); - std::cout << " ------------- past param assignment -------------" << std::endl; - - std::cout << "=========== invoker factory ===============" << std::endl; - std::cout << "name of kernel: " << name << std::endl; - std::cout << "block size: " << block_size << ", grid size: " << grid_size - << ", launch: " << block_size * grid_size << std::endl; auto kernel = handle_.AddKernel("tmp", "tmp", "cg_main.cpp", @@ -343,38 +286,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( kernel_info.comp_options, 0, src); - std::cout << "in: " << params.tensors.inDesc << std::endl; - std::cout << "lens: " << params.tensors.inDesc.GetLengths().size() << std::endl; - // std::cout << "w: " << *params.tensors.w << std::endl; - // std::cout << "out: " << *params.tensors.out << std::endl; - std::cout << "conv: " << problem.GetConv() << std::endl; - std::cout << "in: " << problem.GetIn() << std::endl; - std::cout << "w: " << problem.GetWeights() << std::endl; - std::cout << "out: " << problem.GetOut() << std::endl; - std::cout << "in lengths: " << x.in_lengths[0] << ", " << x.in_lengths[1] << ", " - << x.in_lengths[2] << ", " << x.in_lengths[3] << ", " << x.in_lengths[4] - << ", " << std::endl; - std::cout << "w lengths: " << x.wei_lengths[0] << ", " << x.wei_lengths[1] << ", " - << x.wei_lengths[2] << ", " << x.wei_lengths[3] << ", " << x.wei_lengths[4] - << ", " << std::endl; - std::cout << "out lengths: " << x.out_lengths[0] << ", " << x.out_lengths[1] << ", " - << x.out_lengths[2] << ", " << x.out_lengths[3] << ", " << x.out_lengths[4] - << ", " << std::endl; - std::cout << "in strides: " << x.in_strides[0] << ", " << x.in_strides[1] << ", " - << x.in_strides[2] << ", " << x.in_strides[3] << ", " << x.in_strides[4] - << ", " << std::endl; - std::cout << "wei strides: " << x.wei_strides[0] << ", " << x.wei_strides[1] << ", " - << x.wei_strides[2] << ", " << x.wei_strides[3] << ", " << x.wei_strides[4] - << ", " << std::endl; - std::cout << "out strides: " << x.out_strides[0] << ", " << x.out_strides[1] << ", " - << x.out_strides[2] << ", " << x.out_strides[3] << ", " << x.out_strides[4] - << ", " << std::endl; - std::cout << "filter strides: " << x.filter_strides[0] << ", " << x.filter_strides[1] - << std::endl; - std::cout << "filter dilations: " << x.filter_dilations[0] << ", " - << x.filter_dilations[1] << std::endl; - std::cout << "left pad: " << x.lPadding[0] << ", " << x.lPadding[1] << std::endl; - std::cout << "right pad: " << x.rPadding[0] << ", " << x.rPadding[1] << std::endl; kernel(params.tensors.in, params.tensors.w, @@ -391,21 +302,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( x.rPadding); }; }; - // TODO: remove this, replace with lambda. MIOpen has it's own invoker to launch the kernel - // launch the kernel with arguments needed for the argument pointer - /**k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(), - wei_dev.data(), - out_dev.data(), - prob.in_lengths, - prob.in_strides, - prob.wei_lengths, - prob.wei_strides, - prob.out_lengths, - prob.out_strides, - prob.filter_strides, - prob.filter_dilations, - prob.lPadding, - prob.rPadding);**/ return soln; #else From fe1be1bb1c91291726a261b19c10dcf817213f1d Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 2 Dec 2024 07:36:53 +0000 Subject: [PATCH 63/69] resolved comments from reviews --- src/solver.cpp | 10 ++-- ...ip_implicit_gemm_2d_grouped_fwd_xdlops.cpp | 13 +++-- test/gtest/codegen_group_conv.hpp | 56 +++++++------------ 3 files changed, 32 insertions(+), 47 deletions(-) diff --git a/src/solver.cpp b/src/solver.cpp index 7c6f456e64..2cbbb33a06 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -686,11 +686,6 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) fusion::ConvWinoFuryRxSFused<2, 3>{}.SolverDbId(), miopenConvolutionAlgoWinograd); - RegisterWithSolver(registry, - ++id, - conv::ConvHipImplicitGemmGroupFwdXdlopsCodegen{}, - miopenConvolutionAlgoImplicitGEMM); - Register(registry, ++id, Primitive::RoPE, rope::RoPEForward{}.SolverDbId()); Register(registry, ++id, Primitive::RoPE, rope::RoPEBackward{}.SolverDbId()); Register(registry, ++id, Primitive::ReLU, prelu::MultiWeightsBackward{}.SolverDbId()); @@ -714,6 +709,11 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) multimarginloss::MultiMarginLossForward{}.SolverDbId()); Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId()); + + RegisterWithSolver(registry, + ++id, + conv::ConvHipImplicitGemmGroupFwdXdlopsCodegen{}, + miopenConvolutionAlgoImplicitGEMM); // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white // space between this comment and the newly registered solver(s)! } diff --git a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp index 878b1d9e80..3d525a2ecf 100644 --- a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp +++ b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp @@ -228,15 +228,14 @@ bool ConvHipImplicitGemmGroupFwdXdlopsCodegen::IsApplicable( } return true; #endif - return false; } ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( [[maybe_unused]] const ExecutionContext& ctx, [[maybe_unused]] const ProblemDescription& problem) const { - auto x = CKArgs(problem); #if MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL + auto x = CKArgs(problem); const auto workspace_req = GetWorkspaceSize(ctx, problem); @@ -258,11 +257,13 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( kernel_info.kernel_name = "run_" + name; // Grid size calculation - auto block_size = solution[0].GetTemplateParameter("BlockSize"); + // FIXME: for some reason, the launch params that work in CK, don't work here. + // Launch params are hardcoded for now + // auto block_size = solution[0].GetTemplateParameter("BlockSize"); - auto tmp = get_launch_params(solution[0], x.out_lengths, x.out_strides); + // auto tmp = get_launch_params(solution[0], x.out_lengths, x.out_strides); - auto grid_size = tmp * x.in_lengths[1]; + // auto grid_size = tmp * x.in_lengths[1]; kernel_info.l_wk = {256, 1, 1}; kernel_info.g_wk = {16384, 1, 1}; @@ -277,7 +278,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( soln.invoker_factory = [=](const std::vector& kernels) { return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { decltype(auto) params = raw_params.CastTo(); - auto kernel = handle_.AddKernel("tmp", + auto kernel = handle_.AddKernel("tmp", "tmp", "cg_main.cpp", kernel_info.kernel_name, diff --git a/test/gtest/codegen_group_conv.hpp b/test/gtest/codegen_group_conv.hpp index 90f588b5d5..2ed0c7c221 100644 --- a/test/gtest/codegen_group_conv.hpp +++ b/test/gtest/codegen_group_conv.hpp @@ -288,7 +288,7 @@ struct GroupConvTestFix { EXPECT_FALSE(miopen::range_zero(ref)) << "Cpu data is all zeros"; EXPECT_FALSE(miopen::range_zero(computed)) << "Gpu data is all zeros"; - EXPECT_TRUE(miopen::range_distance(ref) == miopen::range_distance(computed)); + EXPECT_EQ(miopen::range_distance(ref), miopen::range_distance(computed)); /// \todo figure out a better threshold for error checking, esp. for bwd /// data and weight passes. --amberhassaan @@ -303,11 +303,10 @@ struct GroupConvTestFix } auto error = miopen::rms_range(ref, computed); - EXPECT_FALSE(miopen::find_idx(ref, miopen::not_finite) >= 0) + EXPECT_LT(miopen::find_idx(ref, miopen::not_finite), 0) << "Non finite number found in the reference output"; - EXPECT_TRUE(error <= threshold) - << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; + EXPECT_LE(error, threshold); } /// \todo had to pull out tensor and problem construction because the order of @@ -324,20 +323,12 @@ struct GroupConvTestFix std::cout << conv_config << std::endl; auto&& handle = get_handle(); - // static_assert(cg == true, "Cg is false"); - Solver solv{}; auto ctx = miopen::ExecutionContext{}; ctx.SetStream(&handle); - if(!solv.IsApplicable(ctx, problem)) - { - test_skipped = true; - GTEST_SKIP() << solv.SolverDbId() << "Not Applicable for this problem" << conv_config; - } - if(solv.MayNeedWorkspace()) { wspace.resize(solv.GetWorkspaceSize(ctx, problem)); @@ -347,13 +338,7 @@ struct GroupConvTestFix InvokeParamType{tensors, wspace.ptr(), wspace.size(), false, alpha, beta}; ASSERT_TRUE(solv.IsApplicable(ctx, problem)); - // auto sol = solv.GetSolution(ctx, problem); - // if(cg == false){ solv::foo = 1;} auto sol = solv.GetSolution(ctx, problem); - /**if constexpr(cg == false){ - sol = solv.GetSolution(ctx, problem, solv.GetDefaultPerformanceConfig(ctx, - problem)); - }**/ ASSERT_TRUE(sol.Succeeded()); ASSERT_TRUE(sol.invoker_factory); const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); @@ -544,24 +529,23 @@ std::vector GetBetaValues() } } -#define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir, cg) \ - struct GPU_GroupConv##ndim##D_##dir##_##naming_type##_##cg \ - : GroupConvTestFix \ - { \ - }; \ - TEST_P(GPU_GroupConv##ndim##D_##dir##_##naming_type##_##cg, \ - GroupConv##ndim##D_##dir##_##type##_##cg##_Test) \ - { \ - RunSolver(); \ - } \ - INSTANTIATE_TEST_SUITE_P( \ - Full, \ - GPU_GroupConv##ndim##D_##dir##_##naming_type##_##cg, \ - testing::Combine( \ - testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ - testing::ValuesIn(GetAlphaValues()), \ - testing::ValuesIn(GetBetaValues()), \ +#define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir) \ + struct GPU_GroupConv##ndim##D_##dir##_##naming_type \ + : GroupConvTestFix \ + { \ + }; \ + TEST_P(GPU_GroupConv##ndim##D_##dir##_##naming_type, GroupConv##ndim##D_##dir##_##type##_Test) \ + { \ + RunSolver(); \ + } \ + INSTANTIATE_TEST_SUITE_P( \ + Full, \ + GPU_GroupConv##ndim##D_##dir##_##naming_type, \ + testing::Combine( \ + testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ + testing::ValuesIn(GetAlphaValues()), \ + testing::ValuesIn(GetBetaValues()), \ testing::ValuesIn(GetLayoutValues()))); #define DEFINE_CG_GROUP_CONV2D_TEST(type, naming_type, dir) \ - DEFINE_GROUP_CONV_TEST(2, type, naming_type, dir, true) + DEFINE_GROUP_CONV_TEST(2, type, naming_type, dir) From 802140b6863a93c2428db326e2dc34a6245efff2 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 2 Dec 2024 07:55:37 +0000 Subject: [PATCH 64/69] fixed errors in gtest file with extra variable --- test/gtest/codegen_group_conv.hpp | 48 +++++++++++-------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/test/gtest/codegen_group_conv.hpp b/test/gtest/codegen_group_conv.hpp index 2ed0c7c221..29b7e264fb 100644 --- a/test/gtest/codegen_group_conv.hpp +++ b/test/gtest/codegen_group_conv.hpp @@ -95,35 +95,22 @@ struct GroupConvTestConfig<2u> 1.0}; } - template + template static std::vector GetConfigs() { - if constexpr(cg == true) - { - return { - {32, 256, 32, 64, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, - }; - } - else + if constexpr(DIR == Direction::Forward) { - if constexpr(DIR == Direction::Forward) - { - // clang-format off + // clang-format off return { // g n C K img filter pad stride dilation - {1 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, - {1 , 256, 12 , 12 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, - {4 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, - {8 , 256, 192 , 192 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, - {8 , 256, 384 , 384 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, - {32, 256, 1024, 2048, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, + {32, 256, 32, 64, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, }; - // clang-format on - } - else if constexpr(DIR == Direction::BackwardData || DIR == Direction::BackwardWeights) - { - // clang-format off + // clang-format on + } + else if constexpr(DIR == Direction::BackwardData || DIR == Direction::BackwardWeights) + { + // clang-format off return { // g n C K img filter pad stride dilation {1 , 1 , 1 , 1 , {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, @@ -133,12 +120,11 @@ struct GroupConvTestConfig<2u> {8 , 256, 384 , 384 , {28, 28}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}, {32, 256, 1024, 2048, {28, 28}, {3, 3}, {1, 1}, {1, 1}, {1, 1}}, }; - // clang-format on - } - else - { - std::abort(); - } + // clang-format on + } + else + { + std::abort(); } } }; @@ -199,7 +185,7 @@ struct GroupConvTestConfig<3u> 1.0}; } - template + template static std::vector GetConfigs() { @@ -249,7 +235,7 @@ struct GroupConvTestConfig<3u> } }; -template +template struct GroupConvTestFix : public ::testing::TestWithParam< std::tuple, float, float, miopenTensorLayout_t>> @@ -349,7 +335,7 @@ struct GroupConvTestFix template void DispatchSolver() { - if constexpr(cg == true && CONV_DIR == Direction::Forward) + if constexpr(CONV_DIR == Direction::Forward) { RunSolverImpl( miopen::ConvDataTensors{input.desc, From 703065a4a94f9492559a047f9537c204943d6cfa Mon Sep 17 00:00:00 2001 From: Astha Date: Mon, 2 Dec 2024 08:44:40 -0500 Subject: [PATCH 65/69] fixed naming issue in gtest --- test/gtest/codegen_group_conv.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/gtest/codegen_group_conv.hpp b/test/gtest/codegen_group_conv.hpp index 29b7e264fb..0894f26342 100644 --- a/test/gtest/codegen_group_conv.hpp +++ b/test/gtest/codegen_group_conv.hpp @@ -516,17 +516,17 @@ std::vector GetBetaValues() } #define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir) \ - struct GPU_GroupConv##ndim##D_##dir##_##naming_type \ + struct GPU_CGGroupConv##ndim##D_##dir##_##naming_type \ : GroupConvTestFix \ { \ }; \ - TEST_P(GPU_GroupConv##ndim##D_##dir##_##naming_type, GroupConv##ndim##D_##dir##_##type##_Test) \ + TEST_P(GPU_CGGroupConv##ndim##D_##dir##_##naming_type, CGGroupConv##ndim##D_##dir##_##type##_Test) \ { \ RunSolver(); \ } \ INSTANTIATE_TEST_SUITE_P( \ Full, \ - GPU_GroupConv##ndim##D_##dir##_##naming_type, \ + GPU_CGGroupConv##ndim##D_##dir##_##naming_type, \ testing::Combine( \ testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ testing::ValuesIn(GetAlphaValues()), \ From 19f37d00e6065a38b6e32e2559ea098729f26732 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Mon, 2 Dec 2024 16:19:18 +0000 Subject: [PATCH 66/69] formatting fix --- test/gtest/codegen_group_conv.hpp | 33 ++++++++++++++++--------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/test/gtest/codegen_group_conv.hpp b/test/gtest/codegen_group_conv.hpp index 0894f26342..160078a68b 100644 --- a/test/gtest/codegen_group_conv.hpp +++ b/test/gtest/codegen_group_conv.hpp @@ -515,22 +515,23 @@ std::vector GetBetaValues() } } -#define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir) \ - struct GPU_CGGroupConv##ndim##D_##dir##_##naming_type \ - : GroupConvTestFix \ - { \ - }; \ - TEST_P(GPU_CGGroupConv##ndim##D_##dir##_##naming_type, CGGroupConv##ndim##D_##dir##_##type##_Test) \ - { \ - RunSolver(); \ - } \ - INSTANTIATE_TEST_SUITE_P( \ - Full, \ - GPU_CGGroupConv##ndim##D_##dir##_##naming_type, \ - testing::Combine( \ - testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ - testing::ValuesIn(GetAlphaValues()), \ - testing::ValuesIn(GetBetaValues()), \ +#define DEFINE_GROUP_CONV_TEST(ndim, type, naming_type, dir) \ + struct GPU_CGGroupConv##ndim##D_##dir##_##naming_type \ + : GroupConvTestFix \ + { \ + }; \ + TEST_P(GPU_CGGroupConv##ndim##D_##dir##_##naming_type, \ + CGGroupConv##ndim##D_##dir##_##type##_Test) \ + { \ + RunSolver(); \ + } \ + INSTANTIATE_TEST_SUITE_P( \ + Full, \ + GPU_CGGroupConv##ndim##D_##dir##_##naming_type, \ + testing::Combine( \ + testing::ValuesIn(GroupConvTestConfig::GetConfigs()), \ + testing::ValuesIn(GetAlphaValues()), \ + testing::ValuesIn(GetBetaValues()), \ testing::ValuesIn(GetLayoutValues()))); #define DEFINE_CG_GROUP_CONV2D_TEST(type, naming_type, dir) \ From 5ecbe39a9f2093a2c1fd864a0e591dfd67b41aca Mon Sep 17 00:00:00 2001 From: Astha Date: Tue, 7 Jan 2025 03:38:33 -0500 Subject: [PATCH 67/69] removed some unneeded code --- .../codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp index 3d525a2ecf..ca0ae6d3fd 100644 --- a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp +++ b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp @@ -247,13 +247,9 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( auto src = ck::host::InterpolateString( conv_compile_check, {{"include", x.prob.GetIncludeHeader()}, {"template", solution[0].ToTemplateString()}}); - auto srcs = get_headers_for_test(); - srcs.push_back({"main.cpp", src}); auto name = solution[0].GetTemplateParameter("name"); auto kernel_info = KernelInfo{}; - auto path = std::strcat(std::getenv("HOME"), "/workspace/MIOpen/src/kernels/main.cpp"); - kernel_info.kernel_file = path; kernel_info.kernel_name = "run_" + name; // Grid size calculation From de1010314aba04211fa74b55862c5f5ed441e71c Mon Sep 17 00:00:00 2001 From: Astha Date: Tue, 7 Jan 2025 03:40:12 -0500 Subject: [PATCH 68/69] updating CK commit hash --- Dockerfile | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 02925d97c2..5b93f8e630 100755 --- a/Dockerfile +++ b/Dockerfile @@ -115,7 +115,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ # TODO: it should be able to automatically get commit hash from requirements.txt #TODO: swap my branch CK commit out with the proper commit once my branch in CK is merged -ARG CK_COMMIT=563c1e2384af95f9784d682b4590bfcae837b0c4 +ARG CK_COMMIT=887f76356d821af50596d279cd598a64d2d254f0 RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ diff --git a/requirements.txt b/requirements.txt index 6d7dcc7672..69295908b5 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@563c1e2384af95f9784d682b4590bfcae837b0c4 -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@887f76356d821af50596d279cd598a64d2d254f0 -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 From 61c44f9d81360d3a2e24ab552000bf9f25c28f39 Mon Sep 17 00:00:00 2001 From: Astha Date: Wed, 8 Jan 2025 16:06:10 -0500 Subject: [PATCH 69/69] updating to latest CK commit --- Dockerfile | 2 +- requirements.txt | 2 +- ...ip_implicit_gemm_2d_grouped_fwd_xdlops.cpp | 19 ------------------- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5b93f8e630..80069d599c 100755 --- a/Dockerfile +++ b/Dockerfile @@ -115,7 +115,7 @@ DEBIAN_FRONTEND=noninteractive apt-get purge -y --allow-unauthenticated \ # TODO: it should be able to automatically get commit hash from requirements.txt #TODO: swap my branch CK commit out with the proper commit once my branch in CK is merged -ARG CK_COMMIT=887f76356d821af50596d279cd598a64d2d254f0 +ARG CK_COMMIT=3b9a77df7e526d2fa30534e64da3fa779c3e7d9b RUN wget -O ck.tar.gz https://www.github.com/ROCm/composable_kernel/archive/${CK_COMMIT}.tar.gz && \ tar zxvf ck.tar.gz &&\ cd composable_kernel-${CK_COMMIT} && \ diff --git a/requirements.txt b/requirements.txt index 69295908b5..0207c01215 100755 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ nlohmann/json@v3.11.2 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off ROCm/FunctionalPlus@v0.2.18-p0 ROCm/eigen@3.4.0 ROCm/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50 -ROCm/composable_kernel@887f76356d821af50596d279cd598a64d2d254f0 -DCMAKE_BUILD_TYPE=Release +ROCm/composable_kernel@3b9a77df7e526d2fa30534e64da3fa779c3e7d9b -DCMAKE_BUILD_TYPE=Release google/googletest@v1.14.0 diff --git a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp index ca0ae6d3fd..279ecdc0bb 100644 --- a/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp +++ b/src/solver/codegen_conv_hip_implicit_gemm_2d_grouped_fwd_xdlops.cpp @@ -90,23 +90,6 @@ struct Epilogue )"; std::string prologue = ""; -// TODO: temporarily have these two here due to build issues with ck_rtc, remove once resolved -struct src_file -{ - std::filesystem::path path; - std::string_view content; -}; -std::vector get_headers_for_test() -{ - std::vector result; - auto hs = ck::host::GetHeaders(); - std::transform( - hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> src_file { - return {p.first, p.second}; - }); - return result; -} - struct CKArgs { CKArgs(const ProblemDescription& problem) @@ -256,9 +239,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlopsCodegen::GetSolution( // FIXME: for some reason, the launch params that work in CK, don't work here. // Launch params are hardcoded for now // auto block_size = solution[0].GetTemplateParameter("BlockSize"); - // auto tmp = get_launch_params(solution[0], x.out_lengths, x.out_strides); - // auto grid_size = tmp * x.in_lengths[1]; kernel_info.l_wk = {256, 1, 1};