
Commit f8606eb

sarunya authored and facebook-github-bot committed
Reduce prefetch SM usage when using pipeline prefetching
Differential Revision: D61145930
1 parent fa9872f commit f8606eb

4 files changed: +65 −32 lines

fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py

Lines changed: 2 additions & 0 deletions
@@ -1046,6 +1046,7 @@ def prefetch(
             sp_prev_curr_map_gpu,
             inserted_rows_prev,
             actions_count_gpu,
+            use_pipeline=self.prefetch_pipeline,
         )

         # Record the tensors that will be pushed into a queue
@@ -1087,6 +1088,7 @@ def prefetch(
             assigned_cache_slots,
             inserted_rows,
             actions_count_gpu,
+            use_pipeline=self.prefetch_pipeline,
         )

         if linear_cache_indices.numel() > 0:
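
The Python-side change is only that prefetch() now forwards self.prefetch_pipeline to the masked_index_put op as use_pipeline. For orientation, a standalone call looks roughly like the sketch below (shapes and tensor contents are made up for illustration; it assumes fbgemm_gpu with this change is installed and a CUDA device is available):

import torch

# Illustrative shapes only; in prefetch() these are the cache weights,
# assigned cache slots, fetched rows, and the action count.
D = 64
cache = torch.zeros(100, D, device="cuda")                 # "self"
indices = torch.tensor([3, -1, 7], device="cuda")          # -1 marks a conflict miss (skipped)
values = torch.randn(3, D, device="cuda")
count = torch.tensor([3], dtype=torch.int, device="cuda")  # number of indices to process

# use_pipeline=True makes the kernel use only a fraction of the SMs so it
# can overlap with other kernels; the result is the same either way.
cache = torch.ops.fbgemm.masked_index_put(
    cache, indices, values, count, use_pipeline=True
)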

fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu

Lines changed: 34 additions & 23 deletions
@@ -22,6 +22,8 @@
 #include "fbgemm_gpu/utils/tensor_accessor.h"
 #include "fbgemm_gpu/utils/vec4.cuh"

+constexpr int ALL_TO_PREFETCH_SM_RATIO = 8;
+
 using Tensor = at::Tensor;

 using namespace fbgemm_gpu;
@@ -59,31 +61,29 @@ __global__ __launch_bounds__(kMaxThreads) void masked_index_kernel(
     const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
         count) {
   const int32_t N = indices.size(0);
-  const int32_t n = blockIdx.x * blockDim.y + threadIdx.y;
-  if (n >= N) {
-    return;
-  }
   const auto count_ = count[0];
-  if (n >= count_) {
-    return;
-  }
-  // idx == -1 if it is conflict miss
-  const auto idx = indices[n];
-  if (idx < 0) {
-    return;
+  CUDA_KERNEL_ASSERT(count_ <= N);
+  for (int32_t n = blockIdx.x * blockDim.y + threadIdx.y; n < count_;
+       n += blockDim.y * gridDim.x) {
+    // idx == -1 if it is conflict miss
+    const auto idx = indices[n];
+    if (idx < 0) {
+      continue;
+    }
+    const auto D = self.size(1);
+    const auto self_idx = is_index_put ? idx : n;
+    const auto values_idx = is_index_put ? n : idx;
+    vec4_copy(&self[self_idx][0], &values[values_idx][0], D);
   }
-  const auto D = self.size(1);
-  const auto self_idx = is_index_put ? idx : n;
-  const auto values_idx = is_index_put ? n : idx;
-  vec4_copy(&self[self_idx][0], &values[values_idx][0], D);
 }

 template <bool is_index_put>
 Tensor masked_index_impl(
     const Tensor& self,
     const Tensor& indices,
     const Tensor& values,
-    const Tensor& count) {
+    const Tensor& count,
+    const bool use_pipeline) {
   TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(self, indices, values, count);
   TENSOR_CONTIGUOUS(self);
   TENSOR_CONTIGUOUS(indices);
@@ -98,12 +98,20 @@ Tensor masked_index_impl(
   const auto D = self.size(1);
   TORCH_CHECK_EQ(self.size(1), values.size(1));

+  const int32_t tx = std::min<int32_t>(D / 4, kMaxThreads);
+  const dim3 threads(tx, kMaxThreads / tx);
+
+  const auto full_grid_size = div_round_up(N, kMaxThreads / tx);
+
+  // Use a fraction of SMs if use_pipeline=true
+  const auto grid_size = use_pipeline
+      ? std::min(div_round_up(get_device_sm_cnt_(), ALL_TO_PREFETCH_SM_RATIO), full_grid_size)
+      : full_grid_size;
+
   FBGEMM_DISPATCH_FLOAT_HALF_AND_BYTE(
       self.scalar_type(),
       is_index_put ? "masked_index_put" : "masked_index_select",
       [&] {
-        const int32_t tx = std::min<int32_t>(D / 4, kMaxThreads);
-        const dim3 threads(tx, kMaxThreads / tx);
 #ifdef FBGEMM_GPU_MEMCHECK
         const auto func_name = is_index_put ? "masked_index_put_kernel"
                                             : "masked_index_select_kernel";
@@ -112,7 +120,7 @@ Tensor masked_index_impl(
           TORCH_CHECK(D % 16 == 0, "D needs to be padded to be multiple of 16")
         }
         masked_index_kernel<scalar_t, is_index_put>
-            <<<div_round_up(N, kMaxThreads / tx),
+            <<<grid_size,
               dim3(tx, kMaxThreads / tx),
               0,
               at::cuda::getCurrentCUDAStream()>>>(
@@ -131,17 +139,20 @@ Tensor masked_index_put_cuda(
     Tensor self,
     Tensor indices,
     Tensor values,
-    Tensor count) {
-  return masked_index_impl</*is_index_put=*/true>(self, indices, values, count);
+    Tensor count,
+    const bool use_pipeline) {
+  return masked_index_impl</*is_index_put=*/true>(
+      self, indices, values, count, use_pipeline);
 }

 Tensor masked_index_select_cuda(
     Tensor self,
     Tensor indices,
     Tensor values,
-    Tensor count) {
+    Tensor count,
+    const bool use_pipeline) {
   return masked_index_impl</*is_index_put=*/false>(
-      self, indices, values, count);
+      self, indices, values, count, use_pipeline);
 }

 __global__ __launch_bounds__(kMaxThreads) void ssd_cache_actions_insert_kernel(
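
The launch-size logic above is the core of this change: the full grid covers one row per (block, y-thread), and when use_pipeline is true the grid is capped at roughly 1/ALL_TO_PREFETCH_SM_RATIO of the device's SMs, with the new grid-stride loop covering the remaining rows. Paraphrased in Python for intuition (a sketch, not the library's code; ceil_div mirrors div_round_up and prefetch_sm_ratio mirrors ALL_TO_PREFETCH_SM_RATIO):

import torch

def ceil_div(a: int, b: int) -> int:
    # Equivalent of div_round_up in the CUDA code
    return (a + b - 1) // b

def masked_index_grid_size(
    num_indices: int,
    rows_per_block: int,
    use_pipeline: bool,
    prefetch_sm_ratio: int = 8,  # mirrors ALL_TO_PREFETCH_SM_RATIO
) -> int:
    # Grid size needed to cover every index with one row per (block, y-thread)
    full_grid_size = ceil_div(num_indices, rows_per_block)
    if not use_pipeline:
        return full_grid_size
    # When pipelining, cap the grid at ~1/8 of the device's SMs so the
    # prefetch kernel leaves most SMs free for the overlapping compute;
    # the grid-stride loop in the kernel still processes every index.
    sm_count = torch.cuda.get_device_properties(
        torch.cuda.current_device()
    ).multi_processor_count
    return min(ceil_div(sm_count, prefetch_sm_ratio), full_grid_size)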

fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp

Lines changed: 20 additions & 7 deletions
@@ -50,11 +50,18 @@ ssd_cache_populate_actions_cuda(
 /// @param indices The 1D index tensor
 /// @param values The 2D input tensor
 /// @param count The tensor that contains the length of `indices` to
-/// process
+/// process
+/// @param use_pipeline A flag that indicates that this kernel will
+/// overlap with other kernels. If it is true, then use a
+/// fraction of SMs to reduce resource competition
 ///
 /// @return The `self` tensor
-Tensor
-masked_index_put_cuda(Tensor self, Tensor indices, Tensor values, Tensor count);
+Tensor masked_index_put_cuda(
+    Tensor self,
+    Tensor indices,
+    Tensor values,
+    Tensor count,
+    const bool use_pipeline);

 /// @ingroup embedding-ssd
 ///
@@ -76,14 +83,18 @@ masked_index_put_cuda(Tensor self, Tensor indices, Tensor values, Tensor count);
 /// @param indices The 1D index tensor
 /// @param values The 2D input tensor (the tensor that is indexed)
 /// @param count The tensor that contains the length of `indices` to
-/// process
+/// process
+/// @param use_pipeline A flag that indicates that this kernel will
+/// overlap with other kernels. If it is true, then use a
+/// fraction of SMs to reduce resource competition
 ///
 /// @return The `self` tensor
 Tensor masked_index_select_cuda(
     Tensor self,
     Tensor indices,
     Tensor values,
-    Tensor count);
+    Tensor count,
+    const bool use_pipeline);

 Tensor masked_index_put_byte_cuda(
     Tensor self,
@@ -330,15 +341,17 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
       " Tensor self, "
       " Tensor indices, "
       " Tensor values, "
-      " Tensor count"
+      " Tensor count, "
+      " bool use_pipeline=False"
       ") -> Tensor");
   DISPATCH_TO_CUDA("masked_index_put", masked_index_put_cuda);
   m.def(
       "masked_index_select("
       " Tensor self, "
       " Tensor indices, "
       " Tensor values, "
-      " Tensor count"
+      " Tensor count, "
+      " bool use_pipeline=False"
       ") -> Tensor");
   DISPATCH_TO_CUDA("masked_index_select", masked_index_select_cuda);
   m.def(
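
Because the registered schema defaults use_pipeline to False, existing callers that pass only the four tensors are unaffected and keep the full-grid launch. An illustrative call through the updated schema (made-up tensors; assumes fbgemm_gpu is installed and a CUDA device is available):

import torch

D = 64
values = torch.randn(100, D, device="cuda")         # source rows (the tensor that is indexed)
out = torch.zeros(3, D, device="cuda")               # destination ("self")
indices = torch.tensor([5, -1, 42], device="cuda")   # -1 rows are skipped
count = torch.tensor([3], dtype=torch.int, device="cuda")

# use_pipeline is omitted, so the schema default (False) applies and the
# kernel launches with the full grid, exactly as before this change.
out = torch.ops.fbgemm.masked_index_select(out, indices, values, count)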

fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py

Lines changed: 9 additions & 2 deletions
@@ -41,9 +41,10 @@ def execute_masked_index_test(
     num_output_rows: int,
     dtype: torch.dtype,
     test_fn: Callable[
-        [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor
+        [torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, bool], torch.Tensor
     ],
     is_index_put: bool,
+    use_pipeline: bool,
 ) -> None:
     """
     A helper function that generates inputs/outputs, runs
@@ -83,7 +84,7 @@ def execute_masked_index_test(
     output_ref = torch.zeros(num_output_rows, D, dtype=dtype, device=device)

     # Run test
-    output = test_fn(output, indices, values, count)
+    output = test_fn(output, indices, values, count, use_pipeline)

     # Run reference
     indices = indices[:count_val]
@@ -104,6 +105,7 @@ def execute_masked_index_test(
         D=st.integers(min_value=2, max_value=256),
         num_output_rows=st.integers(min_value=10, max_value=100),
         dtype=st.sampled_from([torch.float, torch.half]),
+        use_pipeline=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
     def test_masked_index_put(
@@ -112,6 +114,7 @@ def test_masked_index_put(
         D: int,
         num_output_rows: int,
         dtype: torch.dtype,
+        use_pipeline: bool,
     ) -> None:
         """
         Test correctness of torch.ops.fbgemm.masked_index_put against PyTorch's
@@ -126,6 +129,7 @@ def test_masked_index_put(
             dtype=dtype,
             test_fn=torch.ops.fbgemm.masked_index_put,
             is_index_put=True,
+            use_pipeline=use_pipeline,
         )

     # pyre-ignore [56]
@@ -134,6 +138,7 @@ def test_masked_index_put(
         D=st.integers(min_value=2, max_value=256),
         num_value_rows=st.integers(min_value=10, max_value=100),
         dtype=st.sampled_from([torch.float, torch.half]),
+        use_pipeline=st.booleans(),
     )
     @settings(verbosity=Verbosity.verbose, max_examples=MAX_EXAMPLES, deadline=None)
     def test_masked_index_select(
@@ -142,6 +147,7 @@ def test_masked_index_select(
         D: int,
         num_value_rows: int,
         dtype: torch.dtype,
+        use_pipeline: bool,
     ) -> None:
         """
         Test correctness of torch.ops.fbgemm.masked_index_select aginst
@@ -156,6 +162,7 @@ def test_masked_index_select(
             dtype=dtype,
             test_fn=torch.ops.fbgemm.masked_index_select,
             is_index_put=False,
+            use_pipeline=use_pipeline,
         )

     def expand_tensor(
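
The tests only add use_pipeline as a sampled parameter; the expected output is unchanged because the flag affects SM usage, not results. For reference, the operator semantics the tests check can be restated in plain PyTorch (a sketch of the documented behavior, not the test's actual reference code):

import torch

def masked_index_reference(
    self: torch.Tensor,
    indices: torch.Tensor,
    values: torch.Tensor,
    count: torch.Tensor,
    is_index_put: bool,
) -> torch.Tensor:
    # Only the first `count` indices are processed; use_pipeline does not
    # change the result, it only limits how many SMs the CUDA kernel uses.
    count_val = int(count.item())
    n = torch.arange(count_val, device=indices.device)
    idx = indices[:count_val]
    keep = idx >= 0                 # idx == -1 marks a conflict miss; skip it
    n, idx = n[keep], idx[keep]
    if is_index_put:
        self[idx] = values[n]       # masked_index_put: self[idx] = values[n]
    else:
        self[n] = values[idx]       # masked_index_select: self[n] = values[idx]
    return self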
