Add masked_index_benchmark #2989

Closed · wants to merge 1 commit
fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py (+103 −0)
@@ -707,5 +707,108 @@ def nbit_ssd(
)


@cli.command()
@click.option("--iters", default=100, help="Number of iterations to benchmark")
@click.option("--num-inserts", default=1024, help="Number of rows to insert")
@click.option("--dim", default=128, help="Width of each row to insert")
@click.option(
    "--use-pipeline",
    is_flag=True,
    default=False,
    help="Use a fraction of SMs (using a grid size < the number of SMs)",
)
@click.option(
    "--use-malloc-managed",
    is_flag=True,
    default=False,
    help="Use cudaMallocManaged for the host buffer instead of "
    "malloc+cudaHostRegister",
)
@click.option(
    "--preferred-sms",
    default=-1,
    help="The preferred number of SMs for the kernel to use when using "
    "--use-pipeline. The value is ignored when not using "
    "--use-pipeline.",
)
def masked_index_benchmark(
    iters: int,
    num_inserts: int,
    dim: int,
    use_pipeline: bool,
    use_malloc_managed: bool,
    preferred_sms: int,
) -> None:
    """
    A benchmark for measuring host-to-device copy performance using
    `torch.ops.fbgemm.masked_index_put`. The host buffer is a UVM
    buffer (allocated by default with malloc+cudaHostRegister).

    Args:

        iters (int): Number of iterations to benchmark

        num_inserts (int): Number of rows to insert

        dim (int): Width of each row to insert

        use_pipeline (bool): Use a fraction of SMs (using a grid size
            < the number of SMs)

        use_malloc_managed (bool): Use cudaMallocManaged for the host
            buffer instead of malloc+cudaHostRegister

        preferred_sms (int): The preferred number of SMs for the
            kernel to use when use_pipeline=True. The value is ignored
            when use_pipeline=False

    Returns:

        None
    """

    # Common configs
    dtype = torch.half
    device = "cuda"

    # Generate requests
    values_all = torch.ops.fbgemm.new_unified_tensor(
        torch.zeros(1, device=device, dtype=dtype),
        [num_inserts * iters, dim],
        is_host_mapped=not use_malloc_managed,
    )
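    # Per the --use-malloc-managed help text above: is_host_mapped=True allocates
    # the host buffer with malloc+cudaHostRegister, while is_host_mapped=False
    # (i.e. when --use-malloc-managed is passed) uses cudaMallocManaged.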
    output = torch.empty(num_inserts, dim, dtype=dtype, device=device)
    indices = torch.arange(num_inserts, dtype=torch.long, device=device)
    count = torch.as_tensor([indices.numel()], dtype=torch.int, device=device)
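    # `count` caps how many rows masked_index_put copies. The op is assumed here
    # to behave roughly like `output[indices[:count]] = values[:count]` (with
    # negative indices skipped), so each call moves num_inserts rows from the
    # UVM host buffer into the device-resident `output` tensor. Each request
    # below points at a distinct slice of values_all, presumably so successive
    # iterations read fresh host memory rather than re-copying the same rows.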

    requests = []
    for it in range(iters):
        values = values_all[it * num_inserts : (it + 1) * num_inserts]
        requests.append(TBERequest(output, indices, values))

    # Run benchmark
    time_per_iter = benchmark_requests(
        requests,
        lambda output, indices, values: torch.ops.fbgemm.masked_index_put(
            output,
            indices,
            values,
            count=count,
            use_pipeline=use_pipeline,
            preferred_sms=preferred_sms,
        ),
        num_warmups=10,
    )

    # Report performance
    buffer_bytes = num_inserts * dim * values_all.element_size()
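    # With the defaults (1024 rows x 128 half-precision elements), this is
    # 1024 * 128 * 2 = 262,144 bytes (256 KiB) moved per iteration.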
    logging.info(
        f"masked_index_benchmark: use_pipeline {use_pipeline}, "
        f"Read/write bytes {buffer_bytes} bytes, "
        f"BW: {buffer_bytes / time_per_iter / 1.0e9: .2f} GB/s, "
        f"Time {time_per_iter * 1.0e6:.0f} us"
    )


if __name__ == "__main__":
    cli()
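
Usage sketch (assuming the bench script is invoked directly from a checkout; the subcommand may be spelled masked_index_benchmark or masked-index-benchmark depending on the installed Click version):

    python fbgemm_gpu/bench/ssd_table_batched_embeddings_benchmark.py masked-index-benchmark --iters 100 --num-inserts 1024 --dim 128 --use-pipeline --preferred-sms 8

The value 8 for --preferred-sms is only an illustrative placeholder; per the option's help text it takes effect only together with --use-pipeline.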