Add masked_index_benchmark #2989

Closed · wants to merge 1 commit
fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py (+103 −0)
@@ -707,5 +707,108 @@ def nbit_ssd(
)


@cli.command()
@click.option("--iters", default=100, help="Number of iterations to benchmark")
@click.option("--num-inserts", default=1024, help="Number of rows to insert")
@click.option("--dim", default=128, help="Width of each row to insert")
@click.option(
    "--use-pipeline",
    is_flag=True,
    default=False,
    help="Use a fraction of SMs (using a grid size < the number of SMs)",
)
@click.option(
    "--use-malloc-managed",
    is_flag=True,
    default=False,
    help="Use cudaMallocManaged for the host buffer instead of "
    "malloc+cudaHostRegister",
)
@click.option(
    "--preferred-sms",
    default=-1,
    help="The preferred number of SMs for the kernel to use when using "
    "--use-pipeline. The value is ignored when not using "
    "--use-pipeline.",
)
def masked_index_benchmark(
    iters: int,
    num_inserts: int,
    dim: int,
    use_pipeline: bool,
    use_malloc_managed: bool,
    preferred_sms: int,
) -> None:
    """
    A benchmark for measuring host-to-device copy performance using
    `torch.ops.fbgemm.masked_index_put`. The host buffer is a UVM
    buffer (allocated by default with malloc+cudaHostRegister).

    Args:

        iters (int): Number of iterations to benchmark

        num_inserts (int): Number of rows to insert

        dim (int): Width of each row to insert

        use_pipeline (bool): Use a fraction of SMs (using a grid size
            < the number of SMs)

        use_malloc_managed (bool): Use cudaMallocManaged for the host
            buffer instead of malloc+cudaHostRegister

        preferred_sms (int): The preferred number of SMs for the
            kernel to use when use_pipeline=True. The value is ignored
            when use_pipeline=False

    Returns:

        None
    """

    # Common configs
    dtype = torch.half
    device = "cuda"

    # Generate requests
    values_all = torch.ops.fbgemm.new_unified_tensor(
        torch.zeros(1, device=device, dtype=dtype),
        [num_inserts * iters, dim],
        is_host_mapped=not use_malloc_managed,
    )
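    # Per the --use-malloc-managed help text above: is_host_mapped=True allocates
    # the host buffer with malloc+cudaHostRegister, while is_host_mapped=False
    # (i.e. when --use-malloc-managed is passed) uses cudaMallocManaged.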
    output = torch.empty(num_inserts, dim, dtype=dtype, device=device)
    indices = torch.arange(num_inserts, dtype=torch.long, device=device)
    count = torch.as_tensor([indices.numel()], dtype=torch.int, device=device)
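    # `count` caps how many rows masked_index_put copies. The op is assumed here
    # to behave roughly like `output[indices[:count]] = values[:count]` (with
    # negative indices skipped), so each call moves num_inserts rows from the
    # UVM host buffer into the device-resident `output` tensor. Each request
    # below points at a distinct slice of values_all, presumably so successive
    # iterations read fresh host memory rather than re-copying the same rows.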

    requests = []
    for it in range(iters):
        values = values_all[it * num_inserts : (it + 1) * num_inserts]
        requests.append(TBERequest(output, indices, values))

    # Run benchmark
    time_per_iter = benchmark_requests(
        requests,
        lambda output, indices, values: torch.ops.fbgemm.masked_index_put(
            output,
            indices,
            values,
            count=count,
            use_pipeline=use_pipeline,
            preferred_sms=preferred_sms,
        ),
        num_warmups=10,
    )

    # Report performance
    buffer_bytes = num_inserts * dim * values_all.element_size()
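    # With the defaults (1024 rows x 128 half-precision elements), this is
    # 1024 * 128 * 2 = 262,144 bytes (256 KiB) moved per iteration.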
    logging.info(
        f"masked_index_benchmark: use_pipeline {use_pipeline}, "
        f"Read/write bytes {buffer_bytes} bytes, "
        f"BW: {buffer_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
        f"Time {time_per_iter * 1.0e6:.0f} us"
    )


if __name__ == "__main__":
    cli()
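
Usage sketch (assuming the bench script is invoked directly from a checkout; the subcommand may be spelled masked_index_benchmark or masked-index-benchmark depending on the installed Click version):

    python fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py masked-index-benchmark --iters 100 --num-inserts 1024 --dim 128 --use-pipeline --preferred-sms 8

The value 8 for --preferred-sms is only an illustrative placeholder; per the option's help text it takes effect only together with --use-pipeline.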