Tensor memory support 1 #3755

Closed · wants to merge 25 commits
Changes from 16 commits
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -95,6 +95,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/device_lower/analysis/index_compute.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/predicate_elimination.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/sync_information.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/tensor_memory.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/thread_predicate.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/tma.cpp
${NVFUSER_SRCS_DIR}/device_lower/analysis/trivial_broadcast.cpp
@@ -826,6 +827,7 @@ list(APPEND NVFUSER_RUNTIME_FILES
${NVFUSER_ROOT}/runtime/mbarrier.cu
${NVFUSER_ROOT}/runtime/memory.cu
${NVFUSER_ROOT}/runtime/random_numbers.cu
${NVFUSER_ROOT}/runtime/tensor_memory.cu
${NVFUSER_ROOT}/runtime/tensor.cu
${NVFUSER_ROOT}/runtime/tuple.cu
${NVFUSER_ROOT}/runtime/type_traits.cu
18 changes: 16 additions & 2 deletions csrc/codegen.cpp
@@ -615,7 +615,7 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
value);
auto atype = std::get<ArrayType>(dtype.type);
auto dims = static_cast<int64_t>(value.as<std::vector>().size());
code_ << "{ ";
code_ << "{";
for (auto i = 0; i < dims; i++) {
if (i > 0) {
code_ << ", ";
@@ -681,6 +681,12 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
return;
}

if (ti->view()->getMemoryType() == MemoryType::Tensor) {
code_ << "(uint32_t)(" << genVariableName(ti->view()) << " + "
<< genInline(ti->index()) << ")";
return;
}

if (ti->view()->getMemoryType() == MemoryType::Global &&
kernel_->summary().sync_map->needsRawSync(ti->view()).hasBID()) {
code_ << "*(volatile " << ti->getDataType().value() << "*)&";
@@ -3178,7 +3184,15 @@ class CudaKernelGenerator : private kir::ConstIrVisitor {
indent() << buffer_dtype << " " << genVariableName(tv) << "["
<< genInline(size) << "];\n";
}
} break;
break;
}
case MemoryType::Tensor: {
indent() << "TMemTensor " << genVariableName(tv) << "("
<< genInline(alloc->baseAddress()) << ", "
<< genInline(alloc->laneOffset()) << ", "
<< genInline(alloc->colOffset()) << ");\n";
break;
}
default:
NVF_THROW("Unexpected memory type");
}
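To make the two codegen hunks above concrete, here is a hedged sketch of the kind of kernel code they would emit for a single Tensor-memory TensorView. The `TMemTensor` helper is added to the build via `runtime/tensor_memory.cu` in this PR, but its definition is not part of this diff, so the stand-in below is an assumption inferred from the `TMemTensor name(base, lane, col)` declaration and the `(uint32_t)(name + index)` indexing expression generated above; the tensor names `T2`/`T3` are made up.

```cuda
#include <cstdint>

// Hypothetical stand-in for the TMemTensor runtime helper (the real one lives
// in runtime/tensor_memory.cu, which is not shown in this diff). It packs a
// TMem base address with a (lane, column) offset; the usual TMem address
// convention of lane in the upper 16 bits and column in the lower 16 bits is
// assumed here.
struct TMemTensor {
  uint32_t addr;
  __device__ TMemTensor(uint32_t base, uint16_t lane_offset, uint16_t col_offset)
      : addr(base + ((uint32_t)lane_offset << 16) + col_offset) {}
  __device__ uint32_t operator+(uint32_t offset) const {
    return addr + offset;
  }
};

// Roughly what the generated kernel would contain for one TMem TensorView T2
// whose region base address is held in the shared-memory tensor T3:
__global__ void sketch_kernel() {
  __shared__ uint32_t T3[1];                  // written by tcgen05.alloc
  TMemTensor T2(T3[0], /*laneOffset=*/0, /*colOffset=*/0);  // kir::Allocate
  uint32_t t2_addr = (uint32_t)(T2 + 0);      // a TensorIndex into T2
  (void)t2_addr;  // would be consumed by tcgen05.ld/st or an MMA instruction
}
```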
10 changes: 5 additions & 5 deletions csrc/device_lower/analysis/predicate_elimination.cpp
@@ -571,11 +571,11 @@ class PredicateChcker : public IterVisitor {
// For details on zero loops, see indexMapFromTV in
// lower index pass.
std::vector<Val*> getZeroLeafIds(const TensorView* tv) const {
NVF_ERROR(
tv->getMemoryType() == MemoryType::Local ||
tv->getMemoryType() == MemoryType::Shared,
"Local or shared memory tensor is assumed: ",
tv->toString());
// NVF_ERROR(
// tv->getMemoryType() == MemoryType::Local ||
// tv->getMemoryType() == MemoryType::Shared,
// "Local or shared memory tensor is assumed: ",
// tv->toString());
bool is_shared_mem = tv->getMemoryType() == MemoryType::Shared;
std::vector<Val*> zero_loop_ids;
for (const auto i : c10::irange(tv->nDims())) {
50 changes: 50 additions & 0 deletions csrc/device_lower/analysis/tensor_memory.cpp
@@ -0,0 +1,50 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on

#include <device_lower/analysis/tensor_memory.h>
#include <fusion.h>
#include <ir/all_nodes.h>
#include <type.h>

namespace nvfuser {

TensorMemoryInfo computeTMemInfo(Fusion* fusion) {
TensorMemoryInfo result;

// Compute the allocation information for tensor memory. Currently, we use a
// very simple heuristic that assigns a separate region to each TensorView.
// See note [Tensor Memory Allocation] for the overall design.
auto& regions = result.allocation.regions;
for (auto tv : fusion->allTvs()) {
if (tv->getMemoryType() != MemoryType::Tensor) {
continue;
}
regions.emplace_back();
auto& region = regions.back();

region.address = TensorViewBuilder()
.shape(std::vector<Val*>{})
.dtype(DataType::UInt32)
.build();
region.address->setMemoryType(MemoryType::Shared);

// TODO: right now we hardcode the number of columns to be 32. This is
// definitely not correct.
region.num_columns = IrBuilder::create<Val>(32, DataType::UInt32);

region.covered_tensors.emplace_back();
auto& covered_tensor = region.covered_tensors.back();
covered_tensor.tensor = tv;
covered_tensor.lane_offset = tv->fusion()->zeroVal(DataType::UInt16);
covered_tensor.column_offset = tv->fusion()->zeroVal(DataType::UInt16);
}

return result;
}

} // namespace nvfuser
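As a usage sketch: the plan computed above can be read back through the `GpuLower::tmemInfo()` accessor added in `lower2device.h` below. The walker here is illustrative only and is not the `insertAllocations` pass; it assumes the standard nvFuser `toString()`/`toInlineString()` printing helpers. With the hardcoded heuristic above, it would print one 32-column region per TMem TensorView, each with zero lane and column offsets.

```cpp
#include <device_lower/lower2device.h>

#include <iostream>

namespace nvfuser {

// Illustrative only: walk the allocation plan computed by computeTMemInfo()
// and print one line per region plus one line per covered TensorView.
void printTMemAllocationPlan() {
  const TMemAlllocationInfo& allocation =
      GpuLower::current()->tmemInfo().allocation;
  for (const auto& region : allocation.regions) {
    // Each region corresponds to one tcgen05.alloc that writes its TMem base
    // address into the shared-memory TensorView region.address.
    std::cout << "region address TV: " << region.address->toString()
              << ", columns: " << region.num_columns->toInlineString() << "\n";
    for (const auto& tv_info : region.covered_tensors) {
      std::cout << "  covers " << tv_info.tensor->toString() << " at (lane="
                << tv_info.lane_offset->toInlineString() << ", column="
                << tv_info.column_offset->toInlineString() << ")\n";
    }
  }
}

} // namespace nvfuser
```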
142 changes: 142 additions & 0 deletions csrc/device_lower/analysis/tensor_memory.h
@@ -0,0 +1,142 @@
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#pragma once

#include <vector>

namespace nvfuser {

class Val;
class TensorView;
class Fusion;

// Information used to lower tensor memory. So far, it is just about allocation.
struct TensorMemoryInfo;
TensorMemoryInfo computeTMemInfo(Fusion* fusion);

// Note: [Tensor Memory Allocation]
//
// Tensor memory is a very special memory, so its allocation is also very
// different from other memory types.
//
// It is highly recommended to read the PTX documentation for tensor memory
// if you are not already familiar with it:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tensor-memory
//
// The first thing to note is that TMem is not virtualized. This means we
// cannot just allocate starting from address 0 the way we allocate shared
// memory and rely on a page table to translate the same virtual address of
// different CTAs to different physical addresses. There is no virtual TMem
// address: all addresses are physical addresses.
//
// Because multiple CTAs can execute on the same SM simultaneously, there must
// be some handshaking mechanism for each CTA to know the region of TMem that it
// can use. This is done by using the PTX instruction tcgen05.alloc.
//
// The tcgen05.alloc instruction is like the following:
// tcgen05.alloc [dest], nCols
//
// There are three important things to note about this instruction:
//
// 1. The output of this instruction is written to a shared memory address.
// 2. The unit of allocation is 32 whole columns of tensor memory, and nCols
// must be a power of two.
// 3. The right to allocate is like a mutex and will serialize CTA scheduling.
// tcgen05.alloc blocks when there is no space to allocate.
//
// Point 1 above is not a big problem for us; we just need to make sure we
// allocate the address tensor in shared memory before allocating the tensor
// memory. But points 2 and 3 can be a big challenge. There are basically
// two things to worry about when allocating tensor memory:
//
// 1. Fragmentation. When a tensor does not occupy all lanes, or its size is
// not a power-of-two number of columns, or it is < 32 columns, naively
// allocating all lanes with 32 or the next higher power-of-two columns could
// waste some space. In a perfect world, it would be nice to have a 2D
// allocator that is capable of merging the allocation of multiple tensors
// into a single tcgen05.alloc. For example, if tv0 and tv1 both have 64 rows
// and 32 columns, we can allocate tv0 on the first 64 lanes and tv1 on the
// next 64 lanes. As another example, if tv0 has 128 rows and 31 columns and
// tv1 has 128 rows and 33 columns, we could pack the two tensors into a
// single tcgen05.alloc of 64 columns.
//
// 2. Latency. We should relinquish the right to allocate as soon as we are done
// with allocating, so that other CTAs can grab the "right to allocate" mutex.
// We should also deallocate the tensor memory as soon as we are done using it,
// so that other CTAs' tcgen05.allocs can get unblocked. In a perfect world, it
// would be nice to be able to break one TensorView into multiple deallocations.
// For example, suppose tv0 has 128 rows and 256 columns, and we read these 256
// columns sequentially one by one. In this case, instead of waiting for the
// entire 256-iteration loop to finish, it would be nice to deallocate the first
// 128 columns once we are done reading them, so that other CTAs have a chance
// to allocate their memory in the freed space.
//
// From the above analysis, it is important to realize that the allocation of
// TensorViews and the allocation of tensor memory are not in one-to-one
// correspondence. A TensorView can be allocated by multiple tcgen05.allocs, and
// a tcgen05.alloc can be used to allocate multiple TensorViews.
//
// In practice, it is very difficult to optimize both fragmentation and latency
// perfectly. Although tensor memory was originally designed for matmul, because
// it is a large and fast memory, it would be nice to use it for other purposes,
// such as persistent buffers. This could make it even more difficult to
// allocate tensor memory optimally. Considering the complexity of the problem,
// the development of a tensor memory allocator is likely an incremental
// process. With this in mind, we design the allocation of tensor memory in
// nvFuser to be hackable.
//
// There are three main components in the design:
// 1. A data structure, TMemAllocationInfo, that describes how we allocate
// tensor memory.
// 2. A heuristic, executed as part of computeTMemInfo, that generates the
// allocation information as an instance of TMemAlllocationInfo.
// 3. A pass, executed as part of insertAllocations, that generates the actual
// IR nodes based on the TMemAlllocationInfo.
//
// The TMemAllocationInfo data structure and the insertAllocations support
// a wider range of allocation strategies than the heuristic in computeTMemInfo.
// This provides some flexibility for prototyping and experimentation by just
// manually specifying TMemAllocationInfo.

// The data structure that describes how we allocate tensor memory. It is
// assumed that:
// 1. The TMem allocation is split into regions, with each region described by
// a Region. Each region spans all 128 lanes and N columns of tensor memory.
// The number of columns must be a power of two, with a minimum of 32. Each
// region is allocated by a single tcgen05.alloc and deallocated by a matching
// tcgen05.dealloc.
// 2. Each kernel can have multiple regions.
// 3. Each region can cover multiple TensorViews, but a TensorView cannot span
// multiple regions.
struct TMemAlllocationInfo {
// Each entry describes a region of 128 rows x N columns of tensor memory
// allocated by a single tcgen05.alloc.
struct Region {
// tcgen05.alloc stores the allocated address in shared memory. So we use a
// TensorView with MemoryType::Shared to store this address.
TensorView* address;
// The number of columns to allocate. Must be >= 32 and a power of two.
Val* num_columns;
// The TMem TensorViews covered by this region. Each region can be used to
// store multiple TensorViews. The (lane_offset, column_offset) specifies
// the starting offset of each TensorView in this region.
struct TVInfo {
TensorView* tensor;
Val* lane_offset;
Val* column_offset;
};
std::vector<TVInfo> covered_tensors;
};
std::vector<Region> regions;
};

// The actual definition of TensorMemoryInfo.
struct TensorMemoryInfo {
TMemAlllocationInfo allocation;
};

} // namespace nvfuser
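To make the alloc/dealloc handshake described in the note above concrete, here is a hedged CUDA sketch of device-side wrappers around the tcgen05 instructions it mentions. This is not the `runtime/tensor_memory.cu` added by this PR; the exact PTX qualifiers (`.cta_group::1.sync.aligned`, `.shared::cta.b32`) are written from memory of the tcgen05 section of the PTX ISA and should be checked against the documentation linked in the note, and the snippet only compiles for a Blackwell target (e.g. `sm_100a`).

```cuda
#include <cstdint>

// Allocate nCols whole columns (power of two, >= 32) of tensor memory for this
// CTA. The TMem base address of the region is written to *smem_dest, which
// must be a shared-memory location -- point 1 in the note above.
__device__ void tmemAlloc(uint32_t* smem_dest, uint32_t ncols) {
  uint32_t smem_addr = (uint32_t)__cvta_generic_to_shared(smem_dest);
  asm volatile(
      "tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%0], %1;\n"
      :
      : "r"(smem_addr), "r"(ncols));
}

// Give up the right to allocate as soon as all tcgen05.allocs have been
// issued, so other CTAs on the same SM can grab the allocation "mutex"
// (latency concern 2 in the note).
__device__ void tmemRelinquishAllocPermit() {
  asm volatile("tcgen05.relinquish_alloc_permit.cta_group::1.sync.aligned;\n");
}

// Free a previously allocated region once all reads from it are done, which
// unblocks other CTAs whose tcgen05.alloc is waiting for space.
__device__ void tmemDealloc(uint32_t tmem_addr, uint32_t ncols) {
  asm volatile(
      "tcgen05.dealloc.cta_group::1.sync.aligned.b32 %0, %1;\n"
      :
      : "r"(tmem_addr), "r"(ncols));
}
```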
3 changes: 3 additions & 0 deletions csrc/device_lower/lower2device.cpp
@@ -598,6 +598,9 @@ void GpuLower::analysis(Fusion* fusion) {

consumerToTMAInfo() = getConsumerToTMAInfoMap(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "getConsumerToTMAInfoMap");

tmemInfo() = computeTMemInfo(fusion_);
dumpExprsIfEnabled(fusion_->exprs(), "computeTMemInfo");
}

kir::Kernel* GpuLower::kernel() const {
12 changes: 12 additions & 0 deletions csrc/device_lower/lower2device.h
@@ -14,6 +14,7 @@
#include <device_lower/analysis/fused_reduction.h>
#include <device_lower/analysis/predicate_elimination.h>
#include <device_lower/analysis/sync_information.h>
#include <device_lower/analysis/tensor_memory.h>
#include <device_lower/analysis/thread_predicate.h>
#include <device_lower/analysis/tma.h>
#include <device_lower/analysis/trivial_broadcast.h>
@@ -268,6 +269,14 @@ class GpuLower : public NonCopyable {
return consumer_to_tma_info_;
}

const TensorMemoryInfo& tmemInfo() const {
return tmem_info_;
}

TensorMemoryInfo& tmemInfo() {
return tmem_info_;
}

// Register a boolean Val as a predicate to validate at the run time. Optional
// validation error messages can be given as args.
template <typename... Args>
@@ -365,6 +374,9 @@ class GpuLower : public NonCopyable {
// Keep track of the mbarrier used for each load/store operation
std::unordered_map<const Expr*, TensorView*> ldst_mbarrier_map_;

// Information about tensor memory usage
TensorMemoryInfo tmem_info_;

// Keep track of validations needed at runtime. For example, a pair of
//! "extent mod split_factor == 0" and an error message for divisibility check
//! for vectorization.
2 changes: 1 addition & 1 deletion csrc/device_lower/pass/alias_memory.cpp
@@ -2107,7 +2107,7 @@ std::vector<Expr*> reuseMemoryAllocations(const std::vector<Expr*>& exprs) {
// downstream expressions. Rather than try to keep those in sync, we just
// recompute the allocation info map here.
if (inserted_syncs) {
allocation_info_map = AllocationInfoMap(synced_exprs, false);
allocation_info_map = AllocationInfoMap(synced_exprs, true);
}

assignSharedMemoryAllocations(synced_exprs, allocation_info_map);