Tensor memory support 1 #3755

Closed · wants to merge 25 commits
Changes from 16 commits
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -95,6 +95,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/device_lower/analysis/index_compute.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/predicate_elimination.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/sync_information.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/tensor_memory.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/thread_predicate.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/tma.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/trivial_broadcast.cpp
@@ -826,6 +827,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/mbarrier.cu
${NVFUSER_ROOT}/runtime/memory.cu
${NVFUSER_ROOT}/runtime/random_numbers.cu
${NVFUSER_ROOT}/runtime/tensor_memory.cu
${NVFUSER_ROOT}/runtime/tensor.cu
${NVFUSER_ROOT}/runtime/tuple.cu
${NVFUSER_ROOT}/runtime/type_traits.cu
18 changes: 16 additions & 2 deletions csrc/codegen.cpp
@@ -615,7 +615,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
value);
auto atype = std::get<ArrayType>(dtype.type);
auto dims = static_cast<int64_t>(value.as<std::vector>().size());
code_ << "{ ";
code_ << "{";
for (auto i = 0; i < dims; i++) {
if (i > 0) {
code_ << ", ";
@@ -681,6 +681,12 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
return;
}

if (ti->view()->getMemoryType() == MemoryType::Tensor) {
code_ << "(uint32_t)(" << genVariableName(ti->view()) << " + "
<< genInline(ti->index()) << ")";
return;
}

if (ti->view()->getMemoryType() == MemoryType::Global &&
kernel_->summary().sync_map->needsRawSync(ti->view()).hasBID()) {
code_ << "*(volatile " << ti->getDataType().value() << "*)&";
@@ -3178,7 +3184,15 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
indent() << buffer_dtype << " " << genVariableName(tv) << "["
<< genInline(size) << "];\n";
}
} break;
break;
}
case MemoryType::Tensor: {
indent() << "TMemTensor " << genVariableName(tv) << "("
<< genInline(alloc->baseAddress()) << ", "
<< genInline(alloc->laneOffset()) << ", "
<< genInline(alloc->colOffset()) << ");\n";
break;
}
default:
NVF_THROW("Unexpected memory type");
}
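To make the two codegen hunks above concrete, here is a hedged sketch of the kind of kernel code they would emit for a single Tensor-memory TensorView. The `TMemTensor` helper is added to the build via `runtime/tensor_memory.cu` in this PR, but its definition is not part of this diff, so the stand-in below is an assumption inferred from the `TMemTensor name(base, lane, col)` declaration and the `(uint32_t)(name + index)` indexing expression generated above; the tensor names `T2`/`T3` are made up.

```cuda
#include <cstdint>

// Hypothetical stand-in for the TMemTensor runtime helper (the real one lives
// in runtime/tensor_memory.cu, which is not shown in this diff). It packs a
// TMem base address with a (lane, column) offset; the usual TMem address
// convention of lane in the upper 16 bits and column in the lower 16 bits is
// assumed here.
struct TMemTensor {
  uint32_t addr;
  __device__ TMemTensor(uint32_t base, uint16_t lane_offset, uint16_t col_offset)
      : addr(base + ((uint32_t)lane_offset << 16) + col_offset) {}
  __device__ uint32_t operator+(uint32_t offset) const {
    return addr + offset;
  }
};

// Roughly what the generated kernel would contain for one TMem TensorView T2
// whose region base address is held in the shared-memory tensor T3:
__global__ void sketch_kernel() {
  __shared__ uint32_t T3[1];                  // written by tcgen05.alloc
  TMemTensor T2(T3[0], /*laneOffset=*/0, /*colOffset=*/0);  // kir::Allocate
  uint32_t t2_addr = (uint32_t)(T2 + 0);      // a TensorIndex into T2
  (void)t2_addr;  // would be consumed by tcgen05.ld/st or an MMA instruction
}
```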
10 changes: 5 additions & 5 deletions csrc/device_lower/analysis/predicate_elimination.cpp
@@ -571,11 +571,11 @@ class PredicateChcker : public IterVisitor {
// For details on zero loops, see indexMapFromTV in
// lower index pass.
std::vector<Val*> getZeroLeafIds(const TensorView* tv) const {
NVF_ERROR(
tv->getMemoryType() == MemoryType::Local ||
tv->getMemoryType() == MemoryType::Shared,
"Local or shared memory tensor is assumed: ",
tv->toString());
// NVF_ERROR(
// tv->getMemoryType() == MemoryType::Local ||
// tv->getMemoryType() == MemoryType::Shared,
// "Local or shared memory tensor is assumed: ",
// tv->toString());
bool is_shared_mem = tv->getMemoryType() == MemoryType::Shared;
std::vector<Val*> zero_loop_ids;
for (const auto i : c10::irange(tv->nDims())) {
50 changes: 50 additions & 0 deletions csrc/device_lower/analysis/tensor_memory.cpp
@@ -0,0 +1,50 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on

#include <device_lower/analysis/tensor_memory.h>
#include <fusion.h>
#include <ir/all_nodes.h>
#include <type.h>

namespace nvfuser {

TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
TensorMemoryInfo result;

// Compute the allocation information for tensor memory. Currently, we use a
// very simple heuristic that assigns a separate region to each TensorView.
// See note [Tensor Memory Allocation] for the overall design.
auto& regions = result.allocation.regions;
for (auto tv : fusion->allTvs()) {
if (tv->getMemoryType() != MemoryType::Tensor) {
continue;
}
regions.emplace_back();
auto& region = regions.back();

region.address = TensorViewBuilder()
.shape(std::vector<Val*>{})
.dtype(DataType::UInt32)
.build();
region.address->setMemoryType(MemoryType::Shared);

// TODO: right now we hardcode the number of columns to be 32. This is
// definitely not correct.
region.num_columns = IrBuilder::create<Val>(32, DataType::UInt32);

region.covered_tensors.emplace_back();
auto& covered_tensor = region.covered_tensors.back();
covered_tensor.tensor = tv;
covered_tensor.lane_offset = tv->fusion()->zeroVal(DataType::UInt16);
covered_tensor.column_offset = tv->fusion()->zeroVal(DataType::UInt16);
}

return result;
}

} // namespace nvfuser
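As a usage sketch: the plan computed above can be read back through the `GpuLower::tmemInfo()` accessor added in `lower2device.h` below. The walker here is illustrative only and is not the `insertAllocations` pass; it assumes the standard nvFuser `toString()`/`toInlineString()` printing helpers. With the hardcoded heuristic above, it would print one 32-column region per TMem TensorView, each with zero lane and column offsets.

```cpp
#include <device_lower/lower2device.h>

#include <iostream>

namespace nvfuser {

// Illustrative only: walk the allocation plan computed by computeTMemInfo()
// and print one line per region plus one line per covered TensorView.
void printTMemAllocationPlan() {
  const TMemAlllocationInfo& allocation =
      GpuLower::current()->tmemInfo().allocation;
  for (const auto& region : allocation.regions) {
    // Each region corresponds to one tcgen05.alloc that writes its TMem base
    // address into the shared-memory TensorView region.address.
    std::cout << "region address TV: " << region.address->toString()
              << ", columns: " << region.num_columns->toInlineString() << "\n";
    for (const auto& tv_info : region.covered_tensors) {
      std::cout << "  covers " << tv_info.tensor->toString() << " at (lane="
                << tv_info.lane_offset->toInlineString() << ", column="
                << tv_info.column_offset->toInlineString() << ")\n";
    }
  }
}

} // namespace nvfuser
```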
142 changes: 142 additions & 0 deletions csrc/device_lower/analysis/tensor_memory.h
@@ -0,0 +1,142 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#pragma once

#include <vector>

namespace nvfuser {

class Val;
class TensorView;
class Fusion;

// Information used to lower tensor memory. So far, it is just about allocation.
struct TensorMemoryInfo;
TensorMemoryInfo computeTMemInfo(Fusion* fusion);

// Note: [Tensor Memory Allocation]
//
// Tensor memory is a very special memory, so its allocation is also very
// different from other memory types.
//
// It is highly recommended to read the PTX documentation for tensor memory
// if you are not already familiar with it:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensor-memory
//
// The first thing to note is that TMem is not virtualized. This means we
// cannot just allocate starting from address 0 the way we allocate shared
// memory and rely on a page table to translate the same virtual address of
// different CTAs to different physical addresses. There is no virtual TMem
// address: all addresses are physical addresses.
//
// Because multiple CTAs can execute on the same SM simultaneously, there must
// be some handshaking mechanism for each CTA to know the region of TMem that it
// can use. This is done by using the PTX instruction tcgen05.alloc.
//
// The tcgen05.alloc instruction is like the following:
// tcgen05.alloc [dest], nCols
//
// There are three important things to note about this instruction:
//
// 1. The output of this instruction is written to a shared memory address.
// 2. The unit of allocation is 32 whole columns of tensor memory, and nCols
// must be a power of two.
// 3. The right to allocate is like a mutex and will serialize CTA scheduling.
// tcgen05.alloc blocks when there is no space to allocate.
//
// Point 1 above is not a big problem for us; we just need to make sure we
// allocate the address tensor in shared memory before allocating the tensor
// memory. But points 2 and 3 can be a big challenge. There are basically
// two things to worry about when allocating tensor memory:
//
// 1. Fragmentation. When a tensor does not occupy all lanes, or its size is
// not a power-of-two number of columns, or it is < 32 columns, naively
// allocating all lanes with 32 or the next higher power-of-two columns could
// waste some space. In a perfect world, it would be nice to have a 2D
// allocator that is capable of merging the allocation of multiple tensors
// into a single tcgen05.alloc. For example, if tv0 and tv1 both have 64 rows
// and 32 columns, we can allocate tv0 on the first 64 lanes and tv1 on the
// next 64 lanes. As another example, if tv0 has 128 rows and 31 columns and
// tv1 has 128 rows and 33 columns, we could pack the two tensors into a
// single tcgen05.alloc of 64 columns.
//
// 2. Latency. We should relinquish the right to allocate as soon as we are done
// with allocating, so that other CTAs can grab the "right to allocate" mutex.
// We should also deallocate the tensor memory as soon as we are done using it,
// so that other CTAs' tcgen05.allocs can get unblocked. In a perfect world, it
// would be nice to be able to break one TensorView into multiple deallocations.
// For example, suppose tv0 has 128 rows and 256 columns, and we read these 256
// columns sequentially one by one. In this case, instead of waiting for the
// entire 256-iteration loop to finish, it would be nice to deallocate the first
// 128 columns once we are done reading them, so that other CTAs have a chance
// to allocate their memory in the freed space.
//
// From the above analysis, it is important to realize that the allocation of
// TensorViews and the allocation of tensor memory are not in one-to-one
// correspondence. A TensorView can be allocated by multiple tcgen05.allocs, and
// a tcgen05.alloc can be used to allocate multiple TensorViews.
//
// In practice, it is very difficult to optimize both fragmentation and latency
// perfectly. Although tensor memory was originally designed for matmul, because
// it is a large and fast memory, it would be nice to use it for other purposes,
// such as persistent buffers. This could make it even more difficult to
// allocate tensor memory optimally. Considering the complexity of the problem,
// the development of a tensor memory allocator is likely an incremental
// process. With this in mind, we design the allocation of tensor memory in
// nvFuser to be hackable.
//
// There are three main components in the design:
// 1. A data structure, TMemAllocationInfo, that describes how we allocate
// tensor memory.
// 2. A heuristic, executed as part of computeTMemInfo, that generates the
// allocation information as an instance of TMemAlllocationInfo.
// 3. A pass, executed as part of insertAllocations, that generates the actual
// IR nodes based on the TMemAlllocationInfo.
//
// The TMemAllocationInfo data structure and the insertAllocations support
// a wider range of allocation strategies than the heuristic in computeTMemInfo.
// This provides some flexibility for prototyping and experimentation by just
// manually specifying TMemAllocationInfo.

// The data structure that describes how we allocate tensor memory. It is
// assumed that:
// 1. The TMem allocation is split into regions, with each region described by
// a Region. Each region spans all 128 lanes and N columns of tensor memory.
// The number of columns must be a power of two, with a minimum of 32. Each
// region is allocated by a single tcgen05.alloc and deallocated by a matching
// tcgen05.dealloc.
// 2. Each kernel can have multiple regions.
// 3. Each region can cover multiple TensorViews, but a TensorView cannot span
// multiple regions.
struct TMemAlllocationInfo {
// Each entry describes a region of 128 rows x N columns of tensor memory
// allocated by a single tcgen05.alloc.
struct Region {
// tcgen05.alloc stores the allocated address in shared memory. So we use a
// TensorView with MemoryType::Shared to store this address.
TensorView* address;
// The number of columns to allocate. Must be >= 32 and a power of two.
Val* num_columns;
// The TMem TensorViews covered by this region. Each region can be used to
// store multiple TensorViews. The (lane_offset, column_offset) specifies
// the starting offset of each TensorView in this region.
struct TVInfo {
TensorView* tensor;
Val* lane_offset;
Val* column_offset;
};
std::vector<TVInfo> covered_tensors;
};
std::vector<Region> regions;
};

// The actual definition of TensorMemoryInfo.
struct TensorMemoryInfo {
TMemAlllocationInfo allocation;
};

} // namespace nvfuser
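To make the alloc/dealloc handshake described in the note above concrete, here is a hedged CUDA sketch of device-side wrappers around the tcgen05 instructions it mentions. This is not the `runtime/tensor_memory.cu` added by this PR; the exact PTX qualifiers (`.cta_group::1.sync.aligned`, `.shared::cta.b32`) are written from memory of the tcgen05 section of the PTX ISA and should be checked against the documentation linked in the note, and the snippet only compiles for a Blackwell target (e.g. `sm_100a`).

```cuda
#include <cstdint>

// Allocate nCols whole columns (power of two, >= 32) of tensor memory for this
// CTA. The TMem base address of the region is written to *smem_dest, which
// must be a shared-memory location -- point 1 in the note above.
__device__ void tmemAlloc(uint32_t* smem_dest, uint32_t ncols) {
  uint32_t smem_addr = (uint32_t)__cvta_generic_to_shared(smem_dest);
  asm volatile(
      "tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%0], %1;\n"
      :
      : "r"(smem_addr), "r"(ncols));
}

// Give up the right to allocate as soon as all tcgen05.allocs have been
// issued, so other CTAs on the same SM can grab the allocation "mutex"
// (latency concern 2 in the note).
__device__ void tmemRelinquishAllocPermit() {
  asm volatile("tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;\n");
}

// Free a previously allocated region once all reads from it are done, which
// unblocks other CTAs whose tcgen05.alloc is waiting for space.
__device__ void tmemDealloc(uint32_t tmem_addr, uint32_t ncols) {
  asm volatile(
      "tcgen05.dealloc.cta_group::1.sync.aligned.b32 %0, %1;\n"
      :
      : "r"(tmem_addr), "r"(ncols));
}
```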
3 changes: 3 additions & 0 deletions csrc/device_lower/lower2device.cpp
@@ -598,6 +598,9 @@ void GpuLower::analysis(Fusion* fusion) {

consumerToTMAInfo() = getConsumerToTMAInfoMap(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "getConsumerToTMAInfoMap");

tmemInfo() = computeTMemInfo(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "computeTMemInfo");
}

kir::Kernel* GpuLower::kernel() const {
12 changes: 12 additions & 0 deletions csrc/device_lower/lower2device.h
@@ -14,6 +14,7 @@
#include <device_lower/analysis/fused_reduction.h>
#include <device_lower/analysis/predicate_elimination.h>
#include <device_lower/analysis/sync_information.h>
#include <device_lower/analysis/tensor_memory.h>
#include <device_lower/analysis/thread_predicate.h>
#include <device_lower/analysis/tma.h>
#include <device_lower/analysis/trivial_broadcast.h>
@@ -268,6 +269,14 @@ class GpuLower : public NonCopyable {
return consumer_to_tma_info_;
}

const TensorMemoryInfo& tmemInfo() const {
return tmem_info_;
}

TensorMemoryInfo& tmemInfo() {
return tmem_info_;
}

// Register a boolean Val as a predicate to validate at the run time. Optional
// validation error messages can be given as args.
template <typename... Args>
@@ -365,6 +374,9 @@ class GpuLower : public NonCopyable {
// Keep track of the mbarrier used for each load/store operation
std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_map_;

// Information about tensor memory usage
TensorMemoryInfo tmem_info_;

// Keep track of validations needed at runtime. For example, a pair of
//! "extent mod split_factor == 0" and an error message for divisibility check
//! for vectorization.
2 changes: 1 addition & 1 deletion csrc/device_lower/pass/alias_memory.cpp
@@ -2107,7 +2107,7 @@ std::vector<Expr*> reuseMemoryAllocations(const std::vector<Expr*>& exprs) {
// downstream expressions. Rather than try to keep those in sync, we just
// recompute the allocation info map here.
if (inserted_syncs) {
allocation_info_map = AllocationInfoMap(synced_exprs, false);
allocation_info_map = AllocationInfoMap(synced_exprs, true);
}

assignSharedMemoryAllocations(synced_exprs, allocation_info_map);