diff --git a/driver/softmax_driver.hpp b/driver/softmax_driver.hpp
index b683ff1ed3..9a8ecbf597 100644
--- a/driver/softmax_driver.hpp
+++ b/driver/softmax_driver.hpp
@@ -115,6 +115,7 @@ class SoftmaxDriver : public Driver
     float beta;
     miopenSoftmaxAlgorithm_t algo;
     miopenSoftmaxMode_t mode;
+    bool isForward = false;
 };
 
 template <typename Tgpu, typename Tref>
@@ -181,6 +182,8 @@ std::vector<int> SoftmaxDriver<Tgpu, Tref>::GetInputTensorLengthsFromCmdLine()
     int in_h = inflags.GetValueInt("in_h");
     int in_w = inflags.GetValueInt("in_w");
 
+    isForward = inflags.GetValueInt("forw") == 1;
+
     return std::vector<int>({in_n, in_c, in_h, in_w});
 }
 
@@ -220,6 +223,15 @@ int SoftmaxDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
         dout[i] = Data_scale * prng::gen_A_to_B(static_cast<Tgpu>(-0.5), static_cast<Tgpu>(0.5));
     }
 
+    // if bwd then initialize the y
+    if(!isForward)
+    {
+        for(int i = 0; i < out_sz; i++)
+        {
+            out[i] = prng::gen_A_to_B(static_cast<Tgpu>(-5.0), static_cast<Tgpu>(5.0));
+        }
+    }
+
     status_t status;
     status = in_dev->ToGPU(q, in.data());
     status |= out_dev->ToGPU(q, out.data());
diff --git a/src/kernels/MIOpenSoftmax.cl b/src/kernels/MIOpenSoftmax.cl
index 74667cc3d8..504753a267 100644
--- a/src/kernels/MIOpenSoftmax.cl
+++ b/src/kernels/MIOpenSoftmax.cl
@@ -65,6 +65,12 @@
 #define USE_SOFTMAX_LOG 0
 #endif
 
+#if USE_SOFTMAX_LOG && MIOPEN_USE_FP16
+#define _FLOAT_ACCUM float
+#else
+#define _FLOAT_ACCUM _FLOAT
+#endif
+
 #ifndef USE_SOFTMAX_ACCURATE
 #define USE_SOFTMAX_ACCURATE 0
 #endif
@@ -689,7 +695,8 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
 {
 #if NUM_BATCH == 1 // CSR-Vector like appraoch
-    local _FLOAT l_helper[256];
+
+    local _FLOAT_ACCUM l_helper[256];
 
     int gid = get_group_id(0);
     int lid = get_local_id(0);
 
@@ -705,7 +712,7 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
     int s1 = s % input_w;
 #endif
 
-    _FLOAT channel_dot = (_FLOAT)0; // thread_local helper var
+    _FLOAT_ACCUM channel_dot = (_FLOAT_ACCUM)0; // thread_local helper var
 
     // Compute dot product per channel
     // Iterate over all the channels one thread is supposed to loop over
@@ -835,7 +842,7 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
 
 #else
 
-    local _FLOAT l_helper[256];
+    local _FLOAT_ACCUM l_helper[256];
 
     int gid = get_group_id(0);
     int lid = get_local_id(0);
@@ -845,13 +852,13 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
     int batch = lid / BATCH_SIZE; // which spatial_dim or pixel
 
     // Batch specific n and s
-    int batch_n        = (NUM_BATCH * gid + batch) / spatial_dim; // nth image
-    int batch_s        = (NUM_BATCH * gid + batch) % spatial_dim; // which spatial_dim/pixel
+    int batch_n              = (NUM_BATCH * gid + batch) / spatial_dim; // nth image
+    int batch_s              = (NUM_BATCH * gid + batch) % spatial_dim; // which spatial_dim/pixel
 #if(!IS_DINPUT_PACKED || !IS_DOUTPUT_PACKED || !IS_OUTPUT_PACKED) && USE_SOFTMAX_MODE_CHANNEL
-    int batch_s0       = batch_s / input_w;
-    int batch_s1       = batch_s % input_w;
+    int batch_s0             = batch_s / input_w;
+    int batch_s1             = batch_s % input_w;
 #endif
-    _FLOAT channel_dot = (_FLOAT)(0); // thread_local helper var
+    _FLOAT_ACCUM channel_dot = (_FLOAT)0; // thread_local helper var
 
     // stores all the values touched by one thread so that we do not have load
     // again as the CSR-Vector approach