From 45ddc7e066337ddc1e282c1480ac50bc9b78612b Mon Sep 17 00:00:00 2001
From: Sarunya Pumma
Date: Wed, 14 Aug 2024 13:15:53 -0700
Subject: [PATCH] Add masked_index_benchmark

Summary:
This diff adds a benchmark for measuring host-to-device copy performance
using `torch.ops.fbgemm.masked_index_put`. The host buffer is a UVM buffer
(by default it is `malloc+cudaHostRegister`).

Differential Revision: D61284671
---
 .../ssd_table_batched_embeddings_benchmark.py | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
index b75b8874ab..c2e4d8650d 100644
--- a/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
+++ b/fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
@@ -707,5 +707,85 @@ def nbit_ssd(
     )
 
 
+@cli.command()
+@click.option("--iters", default=100, help="Number of iterations to benchmark")
+@click.option("--num-inserts", default=1024, help="Number of rows to insert")
+@click.option("--dim", default=128, help="Width of each row to insert")
+@click.option(
+    "--use-pipeline",
+    is_flag=True,
+    default=False,
+    help="Use a fraction of SMs (using a grid size < the number of SMs)",
+)
+@click.option(
+    "--use-malloc-managed",
+    is_flag=True,
+    default=False,
+    help="Use cudaMallocManaged for the host buffer instead of "
+    "malloc+cudaHostRegister",
+)
+def masked_index_benchmark(
+    iters: int, num_inserts: int, dim: int, use_pipeline: bool, use_malloc_managed: bool
+) -> None:
+    """
+    A benchmark for measuring host-to-device copy performance using
+    `torch.ops.fbgemm.masked_index_put`. The host buffer is a UVM
+    buffer (by default it is malloc+cudaHostRegister).
+
+    Args:
+        iters (int): Number of iterations to benchmark
+        num_inserts (int): Number of rows to insert
+        dim (int): Width of each row to insert
+        use_pipeline (bool): Use a fraction of SMs (using a grid size
+            < the number of SMs)
+        use_malloc_managed (bool): Use cudaMallocManaged for the host
+            buffer instead of malloc+cudaHostRegister
+
+    Returns:
+        None
+    """
+
+    # Common configs
+    dtype = torch.half
+    device = "cuda"
+
+    # Generate requests
+    values_all = torch.ops.fbgemm.new_unified_tensor(
+        torch.zeros(1, device=device, dtype=dtype),
+        [num_inserts * iters, dim],
+        is_host_mapped=not use_malloc_managed,
+    )
+    output = torch.empty(num_inserts, dim, dtype=dtype, device=device)
+    indices = torch.arange(num_inserts, dtype=torch.long, device=device)
+    count = torch.as_tensor([indices.numel()], dtype=torch.int, device=device)
+
+    requests = []
+    for it in range(iters):
+        values = values_all[it * num_inserts : (it + 1) * num_inserts]
+        requests.append(TBERequest(output, indices, values))
+
+    # Run benchmark
+    time_per_iter = benchmark_requests(
+        requests,
+        lambda output, indices, values: torch.ops.fbgemm.masked_index_put(
+            output,
+            indices,
+            values,
+            count=count,
+            use_pipeline=use_pipeline,
+        ),
+        num_warmups=10,
+    )
+
+    # Report performance
+    buffer_bytes = num_inserts * dim * values_all.element_size()
+    logging.info(
+        f"masked_index_benchmark: use_pipeline {use_pipeline}, "
+        f"Read/write bytes {buffer_bytes} bytes, "
+        f"BW: {buffer_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
+        f"Time {time_per_iter * 1.0e6:.0f} us, "
+    )
+
+
 if __name__ == "__main__":
     cli()