From 45ddc7e066337ddc1e282c1480ac50bc9b78612b Mon Sep 17 00:00:00 2001
From: Sarunya Pumma
Date: Wed, 14 Aug 2024 13:15:53 -0700
Subject: [PATCH] Add masked_index_benchmark

Summary:
This diff adds a benchmark for measuring host-to-device copy performance
using `torch.ops.fbgemm.masked_index_put`. The host buffer is a UVM buffer
(by default it is `malloc+cudaHostRegister`).

Differential Revision: D61284671
---
 .../ssd_table_batched_embeddings_benchmark.py | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
index b75b8874ab..c2e4d8650d 100644
--- a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
@@ -707,5 +707,85 @@ def nbit_ssd(
     )
 
 
+@cli.command()
+@click.option("--iters", default=100, help="Number of iterations to benchmark")
+@click.option("--num-inserts", default=1024, help="Number of rows to insert")
+@click.option("--dim", default=128, help="Width of each row to insert")
+@click.option(
+    "--use-pipeline",
+    is_flag=True,
+    default=False,
+    help="Use a fraction of SMs (using a grid size < the number of SMs)",
+)
+@click.option(
+    "--use-malloc-managed",
+    is_flag=True,
+    default=False,
+    help="Use cudaMallocManaged for the host buffer instead of "
+    "malloc+cudaHostRegister",
+)
+def masked_index_benchmark(
+    iters: int, num_inserts: int, dim: int, use_pipeline: bool, use_malloc_managed: bool
+) -> None:
+    """
+    A benchmark for measuring host-to-device copy performance using
+    `torch.ops.fbgemm.masked_index_put`. The host buffer is a UVM
+    buffer (by default it is malloc+cudaHostRegister).
+
+    Args:
+        iters (int): Number of iterations to benchmark
+        num_inserts (int): Number of rows to insert
+        dim (int): Width of each row to insert
+        use_pipeline (bool): Use a fraction of SMs (using a grid size
+            < the number of SMs)
+        use_malloc_managed (bool): Use cudaMallocManaged for the host
+            buffer instead of malloc+cudaHostRegister
+
+    Returns:
+        None
+    """
+
+    # Common configs
+    dtype = torch.half
+    device = "cuda"
+
+    # Generate requests
+    values_all = torch.ops.fbgemm.new_unified_tensor(
+        torch.zeros(1, device=device, dtype=dtype),
+        [num_inserts * iters, dim],
+        is_host_mapped=not use_malloc_managed,
+    )
+    output = torch.empty(num_inserts, dim, dtype=dtype, device=device)
+    indices = torch.arange(num_inserts, dtype=torch.long, device=device)
+    count = torch.as_tensor([indices.numel()], dtype=torch.int, device=device)
+
+    requests = []
+    for it in range(iters):
+        values = values_all[it * num_inserts : (it + 1) * num_inserts]
+        requests.append(TBERequest(output, indices, values))
+
+    # Run benchmark
+    time_per_iter = benchmark_requests(
+        requests,
+        lambda output, indices, values: torch.ops.fbgemm.masked_index_put(
+            output,
+            indices,
+            values,
+            count=count,
+            use_pipeline=use_pipeline,
+        ),
+        num_warmups=10,
+    )
+
+    # Report performance
+    buffer_bytes = num_inserts * dim * values_all.element_size()
+    logging.info(
+        f"masked_index_benchmark: use_pipeline {use_pipeline}, "
+        f"Read/write bytes {buffer_bytes} bytes, "
+        f"BW: {buffer_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
+        f"Time {time_per_iter * 1.0e6:.0f} us, "
+    )
+
+
 if __name__ == "__main__":
     cli()