From dc89682138209bfffe01989f285a6f3ea812774f Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 15:40:41 +0000
Subject: [PATCH 1/4] Add fclang-abi-compat flag to HIP_HCC_FLAGS to match
 upstream PyTorch.

---
 fbgemm_gpu/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
index 0d454ff580..4bf31497e1 100644
--- a/fbgemm_gpu/CMakeLists.txt
+++ b/fbgemm_gpu/CMakeLists.txt
@@ -201,6 +201,7 @@ if(USE_ROCM)
   list(APPEND HIP_HCC_FLAGS
     " \"-Wno-#pragma-messages\" "
     " \"-Wno-#warnings\" "
+    -fclang-abi-compat=17
     -Wno-cuda-compat
     -Wno-deprecated-declarations
     -Wno-format

From 1c455dd9899f91cea2a0de3a785b640bfed1f16e Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 15:55:51 +0000
Subject: [PATCH 2/4] Initialize variables in support of ROCm 6.2 enablement.

---
 fbgemm_gpu/codegen/genscript/optimizers.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fbgemm_gpu/codegen/genscript/optimizers.py b/fbgemm_gpu/codegen/genscript/optimizers.py
index b056c3c1ba..bd758fe76c 100644
--- a/fbgemm_gpu/codegen/genscript/optimizers.py
+++ b/fbgemm_gpu/codegen/genscript/optimizers.py
@@ -175,6 +175,8 @@ def rowwise_adagrad() -> Dict[str, Any]:
 
     at::acc_type<cache_t, true> multiplier;
     at::acc_type<cache_t, true> correction;
+    multiplier = 0.0;
+    correction = 0.0;
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
         momentum1[idx] = new_sum_square_grads;
@@ -485,6 +487,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]:
 
     at::acc_type<cache_t, true> adjusted_multiplier;
     at::acc_type<cache_t, true> exp_reg_correction;
+    adjusted_multiplier = 0.0;
+    exp_reg_correction = 0.0;
 
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
@@ -852,6 +856,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D;
 
     at::acc_type<cache_t, true> m2;
+    m2  = 0.0;
     if (threadIdx.x == 0) {
         m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square;
         momentum2[idx] = m2;
@@ -998,6 +1003,7 @@ def partial_rowwise_adam() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square) / D;
 
     at::acc_type<cache_t, true> v_hat_t;
+    v_hat_t = 0.0;
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2);
         momentum2[idx] = v_t;

From 5e5a7843cbab828f78592900dfe31fc5e338e885 Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 16:09:56 +0000
Subject: [PATCH 3/4] More variables that require initialization in support of
 ROCm 6.2 enablement.

---
 fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
index 46fd14c9e5..02d35af313 100644
--- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
+++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
@@ -1102,6 +1102,8 @@ DEVICE_INLINE T warp_reduce_max(T val) {
 template <typename scalar_t>
 DEVICE_INLINE float2 warp_find_qparams(scalar_t local_min, scalar_t local_max) {
   float2 qparams;
+  qparams.x = 0.0f;
+  qparams.y = 0.0f;
   local_min = warp_reduce_min<scalar_t>(local_min);
   local_max = warp_reduce_max<scalar_t>(local_max);
   if (threadIdx.x == 0) {

From e431139138233a39cd602d1184aa3154d19931bf Mon Sep 17 00:00:00 2001
From: "Nichols A. Romero" <nick.romero@amd.com>
Date: Thu, 25 Jul 2024 19:23:08 +0000
Subject: [PATCH 4/4] Additional variables that require initialization in
 support of ROCm 6.2 enablement.

---
 fbgemm_gpu/codegen/embedding_common_code_generator.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fbgemm_gpu/codegen/embedding_common_code_generator.py b/fbgemm_gpu/codegen/embedding_common_code_generator.py
index c81b680a26..e1d6062794 100644
--- a/fbgemm_gpu/codegen/embedding_common_code_generator.py
+++ b/fbgemm_gpu/codegen/embedding_common_code_generator.py
@@ -1096,6 +1096,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]:
 
     at::acc_type<cache_t, true> adjusted_multiplier;
     at::acc_type<cache_t, true> exp_reg_correction;
+    adjusted_multiplier = 0.0;
+    exp_reg_correction = 0.0;
 
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
@@ -1463,6 +1465,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D;
 
     at::acc_type<cache_t, true> m2;
+    m2 = 0.0;
     if (threadIdx.x == 0) {
         m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square;
         momentum2[idx] = m2;
@@ -1609,6 +1612,7 @@ def partial_rowwise_adam() -> Dict[str, Any]:
         warpReduceAllSum<at::acc_type<cache_t, true>, kThreadGroupSize>(g_local_sum_square) / D;
 
     at::acc_type<cache_t, true> v_hat_t;
+    v_hat_t = 0.0;
     if (threadIdx.x == 0) {
         at::acc_type<cache_t, true> v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2);
         momentum2[idx] = v_t;