From dc89682138209bfffe01989f285a6f3ea812774f Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 15:40:41 +0000 Subject: [PATCH 1/4] Add fclang-abi-compat flag to HIP_HCC_FLAGS to match upstream PyTorch. --- fbgemm_gpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt index 0d454ff580..4bf31497e1 100644 --- a/fbgemm_gpu/CMakeLists.txt +++ b/fbgemm_gpu/CMakeLists.txt @@ -201,6 +201,7 @@ if(USE_ROCM) list(APPEND HIP_HCC_FLAGS " \"-Wno-#pragma-messages\" " " \"-Wno-#warnings\" " + -fclang-abi-compat=17 -Wno-cuda-compat -Wno-deprecated-declarations -Wno-format From 1c455dd9899f91cea2a0de3a785b640bfed1f16e Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 15:55:51 +0000 Subject: [PATCH 2/4] Initialize variables in support of ROCm 6.2 enablement. --- fbgemm_gpu/codegen/genscript/optimizers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fbgemm_gpu/codegen/genscript/optimizers.py b/fbgemm_gpu/codegen/genscript/optimizers.py index b056c3c1ba..bd758fe76c 100644 --- a/fbgemm_gpu/codegen/genscript/optimizers.py +++ b/fbgemm_gpu/codegen/genscript/optimizers.py @@ -175,6 +175,8 @@ def rowwise_adagrad() -> Dict[str, Any]: at::acc_type multiplier; at::acc_type correction; + multiplier = 0.0; + correction = 0.0; if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; momentum1[idx] = new_sum_square_grads; @@ -485,6 +487,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]: at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; + adjusted_multiplier = 0.0; + exp_reg_correction = 0.0; if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; @@ -852,6 +856,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D; at::acc_type m2; + m2 = 0.0; if (threadIdx.x == 0) { m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square; momentum2[idx] = m2; @@ -998,6 +1003,7 @@ def partial_rowwise_adam() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square) / D; at::acc_type v_hat_t; + v_hat_t = 0.0; if (threadIdx.x == 0) { at::acc_type v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2); momentum2[idx] = v_t; From 5e5a7843cbab828f78592900dfe31fc5e338e885 Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 16:09:56 +0000 Subject: [PATCH 3/4] More variables that require initialization in support of ROCm 6.2 enablement. --- fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh index 46fd14c9e5..02d35af313 100644 --- a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh +++ b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh @@ -1102,6 +1102,8 @@ DEVICE_INLINE T warp_reduce_max(T val) { template DEVICE_INLINE float2 warp_find_qparams(scalar_t local_min, scalar_t local_max) { float2 qparams; + qparams.x = 0.0f; + qparams.y = 0.0f; local_min = warp_reduce_min(local_min); local_max = warp_reduce_max(local_max); if (threadIdx.x == 0) { From e431139138233a39cd602d1184aa3154d19931bf Mon Sep 17 00:00:00 2001 From: "Nichols A. Romero" Date: Thu, 25 Jul 2024 19:23:08 +0000 Subject: [PATCH 4/4] Additional variables that require initialization in support of ROCm 6.2 enablement. --- fbgemm_gpu/codegen/embedding_common_code_generator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fbgemm_gpu/codegen/embedding_common_code_generator.py b/fbgemm_gpu/codegen/embedding_common_code_generator.py index c81b680a26..e1d6062794 100644 --- a/fbgemm_gpu/codegen/embedding_common_code_generator.py +++ b/fbgemm_gpu/codegen/embedding_common_code_generator.py @@ -1096,6 +1096,8 @@ def rowwise_adagrad_with_counter() -> Dict[str, Any]: at::acc_type adjusted_multiplier; at::acc_type exp_reg_correction; + adjusted_multiplier = 0.0; + exp_reg_correction = 0.0; if (threadIdx.x == 0) { at::acc_type new_sum_square_grads = momentum1[idx] + g_avg_square; @@ -1463,6 +1465,7 @@ def partial_rowwise_lamb() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square, shfl_sync_mask) / D; at::acc_type m2; + m2 = 0.0; if (threadIdx.x == 0) { m2 = beta2 * momentum2[idx] + (1.0 - beta2) * g_avg_square; momentum2[idx] = m2; @@ -1609,6 +1612,7 @@ def partial_rowwise_adam() -> Dict[str, Any]: warpReduceAllSum, kThreadGroupSize>(g_local_sum_square) / D; at::acc_type v_hat_t; + v_hat_t = 0.0; if (threadIdx.x == 0) { at::acc_type v_t = momentum2[idx] * beta2 + g_avg_square * (1.0 - beta2); momentum2[idx] = v_t;