diff --git a/driver/softmax_driver.hpp b/driver/softmax_driver.hpp
index b683ff1ed3..9a8ecbf597 100644
--- a/driver/softmax_driver.hpp
+++ b/driver/softmax_driver.hpp
@@ -115,6 +115,7 @@ class SoftmaxDriver : public Driver
     float beta;
     miopenSoftmaxAlgorithm_t algo;
     miopenSoftmaxMode_t mode;
+    bool isForward = false;
 };
 
 template <typename Tgpu, typename Tref>
@@ -181,6 +182,8 @@ std::vector<int> SoftmaxDriver<Tgpu, Tref>::GetInputTensorLengthsFromCmdLine()
     int in_h = inflags.GetValueInt("in_h");
     int in_w = inflags.GetValueInt("in_w");
 
+    isForward = inflags.GetValueInt("forw") == 1;
+
     return std::vector<int>({in_n, in_c, in_h, in_w});
 }
 
@@ -220,6 +223,15 @@ int SoftmaxDriver<Tgpu, Tref>::AllocateBuffersAndCopy()
         dout[i] = Data_scale * prng::gen_A_to_B(static_cast<Tgpu>(-0.5), static_cast<Tgpu>(0.5));
     }
 
+    // if bwd then initialize the y
+    if(!isForward)
+    {
+        for(int i = 0; i < out_sz; i++)
+        {
+            out[i] = prng::gen_A_to_B(static_cast<Tgpu>(-5.0), static_cast<Tgpu>(5.0));
+        }
+    }
+
     status_t status;
     status = in_dev->ToGPU(q, in.data());
     status |= out_dev->ToGPU(q, out.data());
diff --git a/src/kernels/MIOpenSoftmax.cl b/src/kernels/MIOpenSoftmax.cl
index 74667cc3d8..504753a267 100644
--- a/src/kernels/MIOpenSoftmax.cl
+++ b/src/kernels/MIOpenSoftmax.cl
@@ -65,6 +65,12 @@
 #define USE_SOFTMAX_LOG 0
 #endif
 
+#if USE_SOFTMAX_LOG && MIOPEN_USE_FP16
+#define _FLOAT_ACCUM float
+#else
+#define _FLOAT_ACCUM _FLOAT
+#endif
+
 #ifndef USE_SOFTMAX_ACCURATE
 #define USE_SOFTMAX_ACCURATE 0
 #endif
@@ -689,7 +695,8 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
 {
 #if NUM_BATCH == 1 // CSR-Vector like appraoch
-    local _FLOAT l_helper[256];
+
+    local _FLOAT_ACCUM l_helper[256];
 
     int gid = get_group_id(0);
     int lid = get_local_id(0);
 
@@ -705,7 +712,7 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
     int s1 = s % input_w;
 #endif
 
-    _FLOAT channel_dot = (_FLOAT)0; // thread_local helper var
+    _FLOAT_ACCUM channel_dot = (_FLOAT_ACCUM)0; // thread_local helper var
 
     // Compute dot product per channel
     // Iterate over all the channels one thread is supposed to loop over
@@ -835,7 +842,7 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
 
 #else
 
-    local _FLOAT l_helper[256];
+    local _FLOAT_ACCUM l_helper[256];
 
     int gid = get_group_id(0);
     int lid = get_local_id(0);
@@ -845,13 +852,13 @@ __kernel void SoftmaxBackward(global _FLOAT* y,
     int batch = lid / BATCH_SIZE; // which spatial_dim or pixel
 
     // Batch specific n and s
-    int batch_n        = (NUM_BATCH * gid + batch) / spatial_dim; // nth image
-    int batch_s        = (NUM_BATCH * gid + batch) % spatial_dim; // which spatial_dim/pixel
+    int batch_n              = (NUM_BATCH * gid + batch) / spatial_dim; // nth image
+    int batch_s              = (NUM_BATCH * gid + batch) % spatial_dim; // which spatial_dim/pixel
 #if(!IS_DINPUT_PACKED || !IS_DOUTPUT_PACKED || !IS_OUTPUT_PACKED) && USE_SOFTMAX_MODE_CHANNEL
-    int batch_s0       = batch_s / input_w;
-    int batch_s1       = batch_s % input_w;
+    int batch_s0             = batch_s / input_w;
+    int batch_s1             = batch_s % input_w;
 #endif
-    _FLOAT channel_dot = (_FLOAT)(0); // thread_local helper var
+    _FLOAT_ACCUM channel_dot = (_FLOAT)0; // thread_local helper var
 
     // stores all the values touched by one thread so that we do not have load
     // again as the CSR-Vector approach