From 85d49e4b56c169aa3d1162c5486e6c571d0a8822 Mon Sep 17 00:00:00 2001 From: "yuxiao.guo" Date: Fri, 6 Jun 2025 15:00:25 +0800 Subject: [PATCH] Fix a potential buffer size misalignment issue in the TMA description of partition attention --- megakernels/demos/latency/scheduler.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/megakernels/demos/latency/scheduler.py b/megakernels/demos/latency/scheduler.py index 1954abed..7afe196c 100644 --- a/megakernels/demos/latency/scheduler.py +++ b/megakernels/demos/latency/scheduler.py @@ -48,7 +48,10 @@ def make_buffer(shape, buffer_dtype=dtype): stacked_params = model.stacked_params - max_attn_partitions = get_sm_count(device) + def align_sm_to_partition_cacheline(x: int) -> int: + return int(math.ceil(x / 16) * 16) + + max_attn_partitions = align_sm_to_partition_cacheline(get_sm_count(device)) barriers = torch.zeros( [ @@ -60,6 +63,7 @@ def make_buffer(shape, buffer_dtype=dtype): device=device, ) + return Globals( # model params qkv_proj_weights=stacked_params.qkv_proj,