From 342210b4c2dbb01d244b204474c6119f14da68b1 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 30 Jan 2025 10:36:22 -0800 Subject: [PATCH] fix --- csrc/device_lower/pass/allocation.cpp | 56 ++++++++++++++------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/csrc/device_lower/pass/allocation.cpp b/csrc/device_lower/pass/allocation.cpp index 0cf9097a971..b81945e93ed 100644 --- a/csrc/device_lower/pass/allocation.cpp +++ b/csrc/device_lower/pass/allocation.cpp @@ -837,33 +837,35 @@ std::vector insertTMemRegionAllocsAndDeallocs( // Expressions to be inserted at the beginning of the top-level scope. std::list prologue; { - // Allocate the address tensor - auto allocation_address = - GpuLower::current()->tmemInfo().allocation_address; - auto address_alloc_expr = IrBuilder::create( - allocation_address, MemoryType::Shared); - prologue.push_back(address_alloc_expr); - - // the tcgen05.alloc instructions - auto alloc_expr = IrBuilder::create( - allocation_address, - IrBuilder::create( - 32, - DataType::UInt32) // TODO: hard code allocation size to 32 for now - ); - prologue.push_back(alloc_expr); - - // Relinquish the right to allocate after we are done with tcgen05.allocs - auto tcgen05_relinquish_expr = IrBuilder::create( - "tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned", - std::vector{}, - std::vector{}, - kir::Asm::Options{/*volatile=*/true}); - prologue.push_back(tcgen05_relinquish_expr); - - // Block sync that makes allocation visible to all threads - auto block_sync = IrBuilder::create(); - prologue.push_back(block_sync); + if (GpuLower::current()->tmemInfo().allocation_address != nullptr) { + // Allocate the address tensor + auto allocation_address = + GpuLower::current()->tmemInfo().allocation_address; + auto address_alloc_expr = IrBuilder::create( + allocation_address, MemoryType::Shared); + prologue.push_back(address_alloc_expr); + + // the tcgen05.alloc instructions + auto alloc_expr = IrBuilder::create( + allocation_address, + IrBuilder::create( + 32, + DataType::UInt32) // TODO: hard code allocation size to 32 for now + ); + prologue.push_back(alloc_expr); + + // Relinquish the right to allocate after we are done with tcgen05.allocs + auto tcgen05_relinquish_expr = IrBuilder::create( + "tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned", + std::vector{}, + std::vector{}, + kir::Asm::Options{/*volatile=*/true}); + prologue.push_back(tcgen05_relinquish_expr); + + // Block sync that makes allocation visible to all threads + auto block_sync = IrBuilder::create(); + prologue.push_back(block_sync); + } } // Combine prologue and exprs