Skip to content

Commit 5e1f431

Browse files
committed
Merge branch 'develop' into poyenc/integrate-fmha-fwd-v2-v3-apis
2 parents 0e29033 + eb7f617 commit 5e1f431

File tree

72 files changed

+6035
-1134
lines changed

Some content is hidden

Large commits have some of their content hidden by default. Use the search box below to find content that may be hidden.

72 files changed

+6035
-1134
lines changed

example/09_convnd_fwd/convnd_fwd_common.hpp

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,16 @@
1818
#include "ck/library/utility/convolution_parameter.hpp"
1919
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
2020
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
21+
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
22+
#include "ck_tile/host/hip_check_error.hpp"
2123

2224
using ::ck::DeviceMem;
2325
using ::ck::HostTensorDescriptor;
2426
using ::ck::Tensor;
2527

2628
void print_helper_msg()
2729
{
28-
std::cout << "arg1: verification (0=no, 1=yes)\n"
30+
std::cout << "arg1: verification (0=no, 1=CPU, 2=GPU)\n"
2931
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
3032
<< "arg3: time kernel (0=no, 1=yes)\n"
3133
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
@@ -130,7 +132,7 @@ template <ck::index_t NDimSpatial,
130132
typename OutElementOp,
131133
typename DeviceConvNDFwdInstance,
132134
typename ComputeDataType = OutDataType>
133-
bool run_grouped_conv_fwd(bool do_verification,
135+
bool run_grouped_conv_fwd(int do_verification,
134136
int init_method,
135137
bool time_kernel,
136138
const ck::utils::conv::ConvParam& conv_param,
@@ -233,8 +235,11 @@ bool run_grouped_conv_fwd(bool do_verification,
233235
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
234236
<< conv.GetTypeString() << std::endl;
235237

236-
if(do_verification)
238+
std::cout << "do_verification = " << do_verification << std::endl;
239+
240+
if(do_verification == 1)
237241
{
242+
// CPU verification
238243
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
239244
InDataType,
240245
WeiDataType,
@@ -269,6 +274,60 @@ bool run_grouped_conv_fwd(bool do_verification,
269274
get_rtol<OutDataType, ComputeDataType>(),
270275
get_atol<OutDataType, ComputeDataType>());
271276
}
277+
else if(do_verification == 2)
278+
{
279+
// GPU verification using naive GPU reference
280+
std::cout << "Running GPU verification..." << std::endl;
281+
282+
// Allocate and ZERO GPU memory for reference output
283+
DeviceMem out_device_ref_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
284+
out_device_ref_buf.SetZero();
285+
286+
// Extract dimensions using helper function
287+
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
288+
289+
// Launch GPU reference kernel
290+
constexpr ck::index_t block_size = 256;
291+
const ck::long_index_t output_length = dims.N * dims.Do * dims.Ho * dims.Wo * dims.K;
292+
const ck::index_t grid_size = (output_length + block_size - 1) / block_size;
293+
294+
auto gpu_ref_kernel = ck::ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk<InDataType,
295+
WeiDataType,
296+
OutDataType,
297+
ComputeDataType,
298+
InElementOp,
299+
WeiElementOp,
300+
OutElementOp>;
301+
302+
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
303+
reinterpret_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
304+
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
305+
reinterpret_cast<OutDataType*>(out_device_ref_buf.GetDeviceBuffer()),
306+
dims);
307+
308+
HIP_CHECK_ERROR(hipDeviceSynchronize());
309+
310+
std::cout << "GPU reference kernel completed successfully, copying results..." << std::endl;
311+
312+
// Copy GPU reference result to host
313+
out_device_ref_buf.FromDevice(out_host.mData.data());
314+
315+
// Copy GPU kernel result to host
316+
out_device_buf.FromDevice(out_device.mData.data());
317+
318+
std::cout << "Comparing GPU kernel output vs GPU reference..." << std::endl;
319+
320+
// Compare GPU kernel vs GPU reference
321+
bool pass = ck::utils::check_err(out_device,
322+
out_host,
323+
"Error: incorrect results!",
324+
get_rtol<OutDataType, ComputeDataType>(),
325+
get_atol<OutDataType, ComputeDataType>());
326+
327+
std::cout << "GPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
328+
329+
return pass;
330+
}
272331

273332
return true;
274333
}

example/09_convnd_fwd/convnd_fwd_dl_common.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ using ::ck::Tensor;
2525

2626
void print_helper_msg()
2727
{
28-
std::cout << "arg1: verification (0=no, 1=yes)\n"
28+
std::cout << "arg1: verification (0=no, 1=CPU)\n"
2929
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
3030
<< "arg3: time kernel (0=no, 1=yes)\n"
3131
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
@@ -162,6 +162,7 @@ bool run_grouped_conv_fwd_dl(bool do_verification,
162162

163163
if(do_verification)
164164
{
165+
// CPU verification only (DL variants are fused operations)
165166
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
166167
NDimSpatial,
167168
InDataType,

example/09_convnd_fwd/run_convnd_fwd_example.inc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ bool run_convnd_fwd_example(int argc, char* argv[])
1212
{
1313
print_helper_msg();
1414

15-
bool do_verification = true;
16-
int init_method = 1;
17-
bool time_kernel = false;
15+
int do_verification = 1; // 0=no, 1=CPU, 2=GPU
16+
int init_method = 1;
17+
bool time_kernel = false;
1818

1919
ck::utils::conv::ConvParam conv_param{
2020
2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

example/17_convnd_bwd_data/convnd_bwd_data_common.hpp

Lines changed: 103 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,58 @@
1717
#include "ck/library/utility/convolution_parameter.hpp"
1818
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
1919
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp"
20+
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
21+
#include "ck_tile/host/hip_check_error.hpp"
2022

2123
using ::ck::DeviceMem;
2224
using ::ck::HostTensorDescriptor;
2325
using ::ck::Tensor;
2426

27+
template <typename DataType, typename GemmType = DataType>
28+
inline __host__ __device__ constexpr double get_rtol()
29+
{
30+
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
31+
return 5e-3;
32+
else if constexpr(std::is_same_v<DataType, float>)
33+
return 1e-3;
34+
else if constexpr(std::is_same_v<DataType, double>)
35+
return 1e-6;
36+
else if constexpr(std::is_same_v<DataType, ck::half_t>)
37+
return 1e-3;
38+
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
39+
return 5e-2;
40+
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
41+
return 1e-1;
42+
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
43+
return 1.5e-1;
44+
else
45+
return 1e-3;
46+
}
47+
48+
template <typename DataType, typename GemmType = DataType>
49+
inline __host__ __device__ constexpr double get_atol()
50+
{
51+
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
52+
return 1e-3;
53+
else if constexpr(std::is_same_v<DataType, float>)
54+
return 1e-3;
55+
else if constexpr(std::is_same_v<DataType, double>)
56+
return 1e-6;
57+
else if constexpr(std::is_same_v<DataType, ck::half_t>)
58+
return 1e-3;
59+
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
60+
return 5e-2;
61+
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
62+
return 16.1;
63+
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
64+
return 16.1;
65+
else
66+
return 1e-3;
67+
}
68+
2569
void print_helper_msg()
2670
{
27-
std::cout << "arg1: verification (0=no, 1=yes)\n"
71+
std::cout << "arg1: verification (0=no, 1=CPU, 2=GPU)\n"
2872
<< "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"
2973
<< "arg3: time kernel (0=no, 1=yes)\n"
3074
<< ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl;
@@ -38,7 +82,7 @@ template <ck::index_t NDimSpatial,
3882
typename WeiElementOp,
3983
typename OutElementOp,
4084
typename DeviceConvNdBwdDataInstance>
41-
int run_conv_bwd_data(bool do_verification,
85+
int run_conv_bwd_data(int do_verification,
4286
int init_method,
4387
bool time_kernel,
4488
const ck::utils::conv::ConvParam& conv_param,
@@ -128,26 +172,30 @@ int run_conv_bwd_data(bool do_verification,
128172
wei_element_op,
129173
out_element_op);
130174

175+
// Check if optimized kernel supports these parameters
131176
if(!conv.IsSupportedArgument(argument.get()))
132177
{
133178
std::cout << "Not support,please check parameters or device";
134179
return 0;
135180
}
136181

182+
// Run optimized kernel
137183
float ave_time = invoker.Run(argument.get(), StreamConfig{nullptr, time_kernel});
138184

139185
std::size_t flop = conv_param.GetFlops();
140186
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
141187

142-
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
143-
188+
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
144189
float gb_per_sec = num_btype / 1.E6 / ave_time;
145190

146191
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
147192
<< std::endl;
148193

149-
if(do_verification)
194+
std::cout << "do_verification = " << do_verification << std::endl;
195+
196+
if(do_verification == 1)
150197
{
198+
// CPU verification
151199
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData<NDimSpatial,
152200
InDataType,
153201
WeiDataType,
@@ -175,6 +223,56 @@ int run_conv_bwd_data(bool do_verification,
175223

176224
return ck::utils::check_err(in_device, in_host) ? 0 : 1;
177225
}
226+
else if(do_verification == 2)
227+
{
228+
// GPU verification
229+
std::cout << "Running GPU verification..." << std::endl;
230+
231+
DeviceMem in_device_ref_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
232+
in_device_ref_buf.SetZero();
233+
234+
// Extract dimensions using helper function
235+
ck::ref::ConvDims dims = ck::utils::conv::extract_conv_dims(conv_param, NDimSpatial);
236+
237+
constexpr ck::index_t block_size = 256;
238+
const ck::long_index_t input_length = dims.N * dims.Di * dims.Hi * dims.Wi * dims.C;
239+
const ck::index_t grid_size = (input_length + block_size - 1) / block_size;
240+
241+
auto gpu_ref_kernel = ck::ref::naive_conv_bwd_data_ndhwc_kzyxc_ndhwk<InDataType,
242+
WeiDataType,
243+
OutDataType,
244+
float,
245+
InElementOp,
246+
WeiElementOp,
247+
OutElementOp>;
248+
249+
gpu_ref_kernel<<<dim3(grid_size), dim3(block_size), 0, nullptr>>>(
250+
reinterpret_cast<InDataType*>(in_device_ref_buf.GetDeviceBuffer()),
251+
reinterpret_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
252+
reinterpret_cast<const OutDataType*>(out_device_buf.GetDeviceBuffer()),
253+
dims);
254+
255+
HIP_CHECK_ERROR(hipDeviceSynchronize());
256+
257+
std::cout << "GPU reference kernel completed, copying results..." << std::endl;
258+
259+
// Copy GPU reference result
260+
Tensor<InDataType> in_gpu_ref(in_host.mDesc);
261+
in_device_ref_buf.FromDevice(in_gpu_ref.mData.data());
262+
263+
// Copy optimized kernel result
264+
in_device_buf.FromDevice(in_device.mData.data());
265+
266+
// Compare: Optimized kernel result vs GPU reference result
267+
bool pass = ck::utils::check_err(in_device,
268+
in_gpu_ref,
269+
"Error: Incorrect results!",
270+
get_rtol<InDataType, float>(),
271+
get_atol<InDataType, float>());
272+
std::cout << "GPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
273+
274+
return pass ? 0 : 1;
275+
}
178276

179277
return 0;
180278
}

example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,9 @@ int main(int argc, char* argv[])
6363

6464
print_helper_msg();
6565

66-
bool do_verification = true;
67-
int init_method = 1;
68-
bool time_kernel = false;
66+
int do_verification = 1; // 0=no, 1=CPU, 2=GPU
67+
int init_method = 1;
68+
bool time_kernel = false;
6969

7070
ck::utils::conv::ConvParam conv_param{
7171
2, 1, 128, 256, 256, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}};

example/20_grouped_conv_bwd_weight/common.hpp

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "ck/library/utility/convolution_parameter.hpp"
2020
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
2121
#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp"
22+
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
2223

2324
using ::ck::DeviceMem;
2425
using ::ck::HostTensorDescriptor;
@@ -38,6 +39,48 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
3839
static constexpr auto ConvBwdWeightDefault =
3940
ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
4041

42+
template <typename DataType, typename GemmType = DataType>
43+
inline __host__ __device__ constexpr double get_rtol()
44+
{
45+
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
46+
return 5e-3;
47+
else if constexpr(std::is_same_v<DataType, float>)
48+
return 1e-3;
49+
else if constexpr(std::is_same_v<DataType, double>)
50+
return 1e-6;
51+
else if constexpr(std::is_same_v<DataType, ck::half_t>)
52+
return 1e-3;
53+
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
54+
return 5e-2;
55+
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
56+
return 1e-1;
57+
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
58+
return 1.5e-1;
59+
else
60+
return 1e-3;
61+
}
62+
63+
template <typename DataType, typename GemmType = DataType>
64+
inline __host__ __device__ constexpr double get_atol()
65+
{
66+
if constexpr(std::is_same_v<DataType, float> && std::is_same_v<GemmType, ck::tf32_t>)
67+
return 1e-3;
68+
else if constexpr(std::is_same_v<DataType, float>)
69+
return 1e-3;
70+
else if constexpr(std::is_same_v<DataType, double>)
71+
return 1e-6;
72+
else if constexpr(std::is_same_v<DataType, ck::half_t>)
73+
return 1e-3;
74+
else if constexpr(std::is_same_v<DataType, ck::bhalf_t>)
75+
return 5e-2;
76+
else if constexpr(std::is_same_v<DataType, ck::f8_t>)
77+
return 16.1;
78+
else if constexpr(std::is_same_v<DataType, ck::bf8_t>)
79+
return 16.1;
80+
else
81+
return 1e-3;
82+
}
83+
4184
template <typename InputLay, typename WeightLay, typename OutputLay>
4285
struct CommonLayoutSetting
4386
{
@@ -75,9 +118,9 @@ using OutputLayout = typename CommonLayoutSettingSelector<NDimSpatial>::OutputLa
75118

76119
struct ExecutionConfig final
77120
{
78-
bool do_verification = true;
79-
int init_method = 1;
80-
bool time_kernel = false;
121+
int do_verification = 1; // 0=no, 1=CPU, 2=GPU
122+
int init_method = 1;
123+
bool time_kernel = false;
81124
};
82125

83126
#define DefaultConvParam \

0 commit comments

Comments
 (0)