Skip to content

Commit

Permalink
Add masked_index_benchmark (#2989)
Browse files Browse the repository at this point in the history
Summary:
X-link: facebookresearch/FBGEMM#82

Pull Request resolved: #2989

This diff adds a benchmark for measuring host-to-device copy
performance using `torch.ops.fbgemm.masked_index_put`. The host buffer
is a UVM buffer (by default it is `malloc+cudaHostRegister`).

Differential Revision: D61284671
  • Loading branch information
sryap authored and facebook-github-bot committed Aug 15, 2024
1 parent 8c3c89b commit 4e318db
Showing 1 changed file with 96 additions and 0 deletions.
96 changes: 96 additions & 0 deletions fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,5 +707,101 @@ def nbit_ssd(
)


@cli.command()
@click.option("--iters", default=100, help="Number of iterations to benchmark")
@click.option("--num-inserts", default=1024, help="Number of rows to insert")
@click.option("--dim", default=128, help="Width of each row to insert")
@click.option(
    "--use-pipeline",
    is_flag=True,
    default=False,
    help="Use a fraction of SMs (using a grid size < the number of SMs)",
)
@click.option(
    "--use-malloc-managed",
    is_flag=True,
    default=False,
    help="Use cudaMallocManaged for the host buffer instead of "
    "malloc+cudaHostRegister",
)
@click.option(
    "--preferred-sms",
    default=-1,
    help="The preferred number of SMs for the kernel to use when using "
    "--use-pipeline. The value is ignored when not using "
    "--use-pipeline.",
)
def masked_index_benchmark(
    iters: int,
    num_inserts: int,
    dim: int,
    use_pipeline: bool,
    use_malloc_managed: bool,
    preferred_sms: int,
) -> None:
    """
    Benchmark host-to-device copy throughput via
    `torch.ops.fbgemm.masked_index_put`. Source rows live in a UVM
    host buffer (malloc+cudaHostRegister by default).

    Args:
        iters (int): Number of benchmark iterations
        num_inserts (int): Rows copied per iteration
        dim (int): Width (columns) of each row
        use_pipeline (bool): Restrict the kernel to a fraction of SMs
            (a grid size smaller than the SM count)
        use_malloc_managed (bool): Allocate the host buffer with
            cudaMallocManaged instead of malloc+cudaHostRegister
        preferred_sms (int): Preferred SM count for the kernel when
            use_pipeline=True; ignored when use_pipeline=False

    Returns:
        None
    """

    # Benchmark-wide configuration
    dtype = torch.half
    device = "cuda"

    # One large UVM host buffer sized for all iterations; each iteration
    # consumes its own slice of it.
    host_rows = torch.ops.fbgemm.new_unified_tensor(
        torch.zeros(1, device=device, dtype=dtype),
        [num_inserts * iters, dim],
        is_host_mapped=not use_malloc_managed,
    )
    dst = torch.empty(num_inserts, dim, dtype=dtype, device=device)
    row_ids = torch.arange(num_inserts, dtype=torch.long, device=device)
    count = torch.as_tensor([row_ids.numel()], dtype=torch.int, device=device)

    # All requests share the destination tensor and index list; only the
    # source slice of the host buffer differs per iteration.
    requests = [
        TBERequest(
            dst,
            row_ids,
            host_rows[it * num_inserts : (it + 1) * num_inserts],
        )
        for it in range(iters)
    ]

    # Time the host-to-device copies
    time_per_iter = benchmark_requests(
        requests,
        lambda output, indices, values: torch.ops.fbgemm.masked_index_put(
            output,
            indices,
            values,
            count=count,
            use_pipeline=use_pipeline,
            preferred_sms=preferred_sms,
        ),
        num_warmups=10,
    )

    # Report achieved bandwidth and per-iteration latency
    buffer_bytes = num_inserts * dim * host_rows.element_size()
    logging.info(
        f"masked_index_benchmark: use_pipeline {use_pipeline}, "
        f"Read/write bytes {buffer_bytes} bytes, "
        f"BW: {buffer_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
        f"Time {time_per_iter * 1.0e6:.0f} us, "
    )


if __name__ == "__main__":
    # Script entry point: dispatch to the selected click subcommand.
    cli()

0 comments on commit 4e318db

Please sign in to comment.