diff --git a/src/kernels/MIOpenTensorKernelsHip.cpp b/src/kernels/MIOpenTensorKernelsHip.cpp index 02f258c8a8..e168a8d811 100644 --- a/src/kernels/MIOpenTensorKernelsHip.cpp +++ b/src/kernels/MIOpenTensorKernelsHip.cpp @@ -23,6 +23,7 @@ * SOFTWARE. * *******************************************************************************/ + #ifndef MIOPEN_DONT_USE_HIP_RUNTIME_HEADERS #include #include @@ -342,3 +343,84 @@ extern "C" __global__ void Op4dTensorGeneric(MIOPEN_TYPE* a, } #endif + +#ifdef USE_4D_TENSOR_GENERIC_2 +// NCHW +extern "C" __global__ void Op4dTensorGeneric2(const MIOPEN_TYPE* a, + const MIOPEN_TYPE* b, + MIOPEN_TYPE* c, + const uint64_t Aoffset, + const uint64_t Boffset, + const uint64_t Coffset, + const uint32_t b_c, + const uint32_t b_h, + const uint32_t b_w, + const uint32_t c_c, + const uint32_t c_h, + const uint32_t c_w, + const uint32_t a_nstride, + const uint32_t a_cstride, + const uint32_t a_hstride, + const uint32_t a_wstride, + const uint32_t b_nstride, + const uint32_t b_cstride, + const uint32_t b_hstride, + const uint32_t b_wstride, + const uint32_t c_nstride, + const uint32_t c_cstride, + const uint32_t c_hstride, + const uint32_t c_wstride, + const MIOPEN_TYPE alpha0, + const MIOPEN_TYPE alpha1, + const MIOPEN_TYPE beta, + const uint32_t total_work, + const bool use_beta) +{ + const MIOPEN_TYPE* a_off = a + Aoffset; + const MIOPEN_TYPE* b_off = b + Boffset; + MIOPEN_TYPE* c_off = c + Coffset; + + auto gid = blockIdx.x * blockDim.x + threadIdx.x; + const auto* a_ptr = a_off + (gid / (c_c * c_h * c_w)) * a_nstride + + ((gid % (c_c * c_h * c_w)) / (c_h * c_w)) * a_cstride + + ((gid % (c_h * c_w)) / c_w) * a_hstride + (gid % c_w) * a_wstride; + auto* c_ptr = c_off + (gid / (c_c * c_h * c_w)) * c_nstride + + ((gid % (c_c * c_h * c_w)) / (c_h * c_w)) * c_cstride + + ((gid % (c_h * c_w)) / c_w) * c_hstride + (gid % c_w) * c_wstride; + + const auto step = gridDim.x * blockDim.x; + const auto a_step = (step / (c_c * c_h * c_w)) * a_nstride + + ((step % (c_c * c_h * c_w)) / (c_h * c_w)) * a_cstride + + ((step % (c_h * c_w)) / c_w) * a_hstride + (step % c_w) * a_wstride; + const auto c_step = (step / (c_c * c_h * c_w)) * c_nstride + + ((step % (c_c * c_h * c_w)) / (c_h * c_w)) * c_cstride + + ((step % (c_h * c_w)) / c_w) * c_hstride + (step % c_w) * c_wstride; + + const auto c_end = c_off + total_work * c_nstride; + while(c_ptr < c_end) + { + const auto* b_ptr = b_off; + if(b_nstride != 0) + b_ptr += (gid / (b_c * b_h * b_w)) * b_nstride; + + if(b_cstride != 0) + b_ptr += ((gid % (b_c * b_h * b_w)) / (b_h * b_w)) * b_cstride; + + if(b_hstride != 0) + b_ptr += ((gid % (b_h * b_w)) / b_w) * b_hstride; + + if(b_wstride != 0) + b_ptr += (gid % b_w) * b_wstride; + + auto a_val = *a_ptr; + auto b_val = *b_ptr; + auto c_val = use_beta ? *c_ptr : static_cast(0); + *c_ptr = MIOPEN_TENSOR_OP(a_val * alpha0, b_val * alpha1) + c_val * beta; + + a_ptr += a_step; + c_ptr += c_step; + gid += step; + } +} + +#endif