Add masked_index_benchmark
Summary:
This diff adds a benchmark for measuring host-to-device copy
performance using `torch.ops.fbgemm.masked_index_put`. The host buffer
is a UVM buffer (by default it is allocated with `malloc` + `cudaHostRegister`;
`cudaMallocManaged` can be selected instead via `--use-malloc-managed`).

Differential Revision: D61284671
sryap authored and facebook-github-bot committed Aug 14, 2024
1 parent f8606eb commit 45ddc7e
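
For reference, the sketch below illustrates the copy that this benchmark times. It assumes, based on how the benchmark calls the op, that `masked_index_put(output, indices, values, count=count)` writes the first `count` rows of the host-resident `values` buffer into `output` at the given row `indices`. This is an illustrative reference in plain PyTorch, not FBGEMM's kernel.

import torch

def masked_index_put_reference(
    output: torch.Tensor,   # (N, dim) destination rows on the GPU
    indices: torch.Tensor,  # (num_inserts,) destination row indices
    values: torch.Tensor,   # (num_inserts, dim) source rows in host/UVM memory
    count: torch.Tensor,    # scalar tensor holding the number of valid rows
) -> torch.Tensor:
    n = int(count.item())
    # Read the host-resident rows and scatter them into the device tensor.
    # The real op performs this on the GPU, reading the UVM buffer directly,
    # which is the host-to-device path being measured here.
    output[indices[:n]] = values[:n].to(output.device, non_blocking=True)
    return output
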
Showing 1 changed file with 80 additions and 0 deletions.
fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
@@ -707,5 +707,85 @@ def nbit_ssd(
)


@cli.command()
@click.option("--iters", default=100, help="Number of iterations to benchmark")
@click.option("--num-inserts", default=1024, help="Number of rows to insert")
@click.option("--dim", default=128, help="Width of each row to insert")
@click.option(
"--use-pipeline",
is_flag=True,
default=False,
help="Use a fraction of SMs (using a grid size < the number of SMs)",
)
@click.option(
"--use-malloc-managed",
is_flag=True,
default=False,
help="Use cudaMallocManaged for the host buffer instead of "
"malloc+cudaHostRegister",
)
def masked_index_benchmark(
iters: int, num_inserts: int, dim: int, use_pipeline: bool, use_malloc_managed: bool
) -> None:
"""
A benchmark for measuring host-to-device copy performance using
`torch.ops.fbgemm.masked_index_put`. The host buffer is a UVM
buffer (by default it is malloc+cudaHostRegister).
Args:
iters (int): Number of iterations to benchmark
num_inserts (int): Number of rows to insert
dim (int): Width of each row to insert
use_pipeline (bool): Use a fraction of SMs (using a grid size
< the number of SMs)
use_malloc_managed (bool): Use cudaMallocManaged for the host
buffer instead of malloc+cudaHostRegister
Returns:
None
"""

# Common configs
dtype = torch.half
device = "cuda"

# Generate requests
values_all = torch.ops.fbgemm.new_unified_tensor(
torch.zeros(1, device=device, dtype=dtype),
[num_inserts * iters, dim],
is_host_mapped=not use_malloc_managed,
)
output = torch.empty(num_inserts, dim, dtype=dtype, device=device)
indices = torch.arange(num_inserts, dtype=torch.long, device=device)
count = torch.as_tensor([indices.numel()], dtype=torch.int, device=device)

requests = []
for it in range(iters):
values = values_all[it * num_inserts : (it + 1) * num_inserts]
requests.append(TBERequest(output, indices, values))

# Run benchmark
time_per_iter = benchmark_requests(
requests,
lambda output, indices, values: torch.ops.fbgemm.masked_index_put(
output,
indices,
values,
count=count,
use_pipeline=use_pipeline,
),
num_warmups=10,
)

# Report performance
buffer_bytes = num_inserts * dim * values_all.element_size()
logging.info(
f"masked_index_benchmark: use_pipeline {use_pipeline}, "
f"Read/write bytes {buffer_bytes} bytes, "
f"BW: {buffer_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
f"Time {time_per_iter * 1.0e6:.0f} us, "
)


if __name__ == "__main__":
cli()
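
As a sanity check on the reported numbers: with the CLI defaults (`num_inserts=1024`, `dim=128`, `torch.half`, i.e. 2 bytes per element), each iteration moves 256 KiB, and the bandwidth figure is simply that byte count divided by the measured iteration time. A small worked example with a hypothetical timing:

# Worked example of the reported figures, using the CLI defaults
# (num_inserts=1024, dim=128, dtype=torch.half => 2 bytes per element).
num_inserts, dim, element_size = 1024, 128, 2
buffer_bytes = num_inserts * dim * element_size   # 262,144 bytes (256 KiB)
time_per_iter = 26e-6                             # hypothetical: 26 us per iteration
bandwidth_gbs = buffer_bytes / time_per_iter / 1.0e9
print(f"{buffer_bytes} bytes, {bandwidth_gbs:.2f} GB/s")  # 262144 bytes, 10.08 GB/s

The benchmark is exposed as a Click subcommand of this script, e.g. `python fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py masked-index-benchmark` (the subcommand may be spelled `masked_index_benchmark` on older Click versions), optionally with `--use-pipeline` or `--use-malloc-managed`. With such a small per-iteration buffer, the reported GB/s may sit well below peak link bandwidth.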
