Skip to content

Commit

Permalink
Merge branch 'main' into fp8_enable_on_sm89
Browse files Browse the repository at this point in the history
  • Loading branch information
jjsjann123 authored Dec 20, 2024
2 parents 44e949b + 49b0862 commit 48e620f
Show file tree
Hide file tree
Showing 6 changed files with 546 additions and 49 deletions.
41 changes: 10 additions & 31 deletions csrc/scheduler/resize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,40 +71,19 @@ bool ResizeScheduler::canScheduleCompileTime(Fusion* fusion) {
IdModel id_model(fusion, /*build_graphs=*/false);
const auto& broadcast_graph = id_model.buildBroadcastGraph();

// For now, only a single resize op is allowed to exist.
auto resize_based_tensor_ops = ir_utils::getOpsOfType<SliceOp, PadOp>(fusion);
if (resize_based_tensor_ops.size() != 1) {
scheduler_debug_utils::canScheduleRejectReason(
schedulerType(), "Only a single resize op is allowed.");
return false;
}

auto resize_out_tv =
resize_based_tensor_ops.at(0)->output(0)->as<TensorView>();

auto all_dep_vals = DependencyCheck::getAllValsBetween(
{fusion->inputs().begin(), fusion->inputs().end()}, {resize_out_tv});
for (auto tv : ir_utils::filterByType<TensorView>(all_dep_vals)) {
if (tv == resize_out_tv) {
continue;
}
if (tv->isFusionOutput()) {
scheduler_debug_utils::canScheduleRejectReason(
schedulerType(),
"Dependency to fusion output not allowed: ",
tv->toString());
return false;
}
for (auto consumer_of_tv : ir_utils::consumerTvsOf(tv)) {
if (std::find(all_dep_vals.begin(), all_dep_vals.end(), consumer_of_tv) ==
all_dep_vals.end()) {
scheduler_debug_utils::canScheduleRejectReason(
schedulerType(),
"Resize inputs must be exclusively consumed by resize: ",
consumer_of_tv->toString());
return false;
}
if (auto non_exclusive_resizes = scheduler_tools::getNonExclusiveResizeInfo(
resize_based_tensor_ops, id_model.idGraph(IdMappingMode::EXACT));
!non_exclusive_resizes.empty()) {
std::stringstream msg;
msg << "Propagation of resizes would affect fusion outputs.";
for (const auto& [tv, resize_ids] : non_exclusive_resizes) {
msg << " Resize input tv: " << tv->toString()
<< ", resize input ID groups: " << nvfuser::toString(resize_ids);
}
scheduler_debug_utils::canScheduleRejectReason(schedulerType(), msg.str());
return false;
}

// Slicing of or to a broadcast ID is not allowed yet.
Expand Down
106 changes: 106 additions & 0 deletions csrc/scheduler/tools/resize_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,111 @@ void propagateResizeToInputs(Expr* resize_tensor_op) {
}
}

// For each resize-based tensor op in ordered_resize_tensor_ops, find
// the resize input ID groups (in exact_graph) whose propagation
// toward the fusion inputs would reach a tensor that is a fusion
// output or that has a consumer outside of the dependency set of the
// op input.
//
// ordered_resize_tensor_ops must be non-empty. The ops are visited in
// the given order, and after each op its output is added to the set
// of traversal start points, so later ops only analyze the section of
// the fusion that starts from earlier resize outputs.
//
// Returns a map from the input tensor of each op found to be
// non-exclusive to the non-exclusively used resize input ID
// groups. Ops with no such ID group do not appear in the map.
std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
    const std::vector<Expr*>& ordered_resize_tensor_ops,
    const ValGraph& exact_graph) {
  NVF_ERROR(!ordered_resize_tensor_ops.empty());
  Fusion* fusion = ordered_resize_tensor_ops[0]->fusion();

  std::unordered_map<TensorView*, ValGroups> non_exclusive_resizes;

  // Start points of the dependency traversal. Initially the fusion
  // inputs; outputs of already analyzed ops are added below.
  std::unordered_set<Val*> inputs{
      fusion->inputs().begin(), fusion->inputs().end()};

  // Collect the exact groups of the input IDs of all Resize exprs
  // found between the root and logical domains of tv.
  auto get_root_to_logical_resizes =
      [&exact_graph](TensorView* tv) -> ValGroups {
    // This should be only used for outputs of resize-based ops,
    // so it should always have a root domain.
    NVF_ERROR(tv->hasRoot());
    auto out_tv_root_to_logical_exprs = DependencyCheck::getAllExprsBetween(
        {tv->getRootDomain().begin(), tv->getRootDomain().end()},
        {tv->getLogicalDomain().begin(), tv->getLogicalDomain().end()});
    ValGroups resize_inp_ids;
    for (auto resize :
         ir_utils::filterByType<Resize>(out_tv_root_to_logical_exprs)) {
      resize_inp_ids.pushBack(exact_graph.toGroup(resize->in()));
    }
    return resize_inp_ids;
  };

  // Traverse the ops in a topological order
  for (Expr* resize_tensor_op : ordered_resize_tensor_ops) {
    auto inp_tv = dynamic_cast<TensorView*>(resize_tensor_op->inputs().at(0));
    auto out_tv = dynamic_cast<TensorView*>(resize_tensor_op->outputs().at(0));
    // dynamic_cast yields nullptr when the value is not a
    // TensorView. Both pointers are dereferenced below, so fail
    // explicitly rather than dereferencing a null pointer.
    NVF_ERROR(inp_tv != nullptr);
    NVF_ERROR(out_tv != nullptr);

    ValGroups resize_inp_ids = get_root_to_logical_resizes(out_tv);
    NVF_ERROR(!resize_inp_ids.empty());

    auto dep_vals =
        DependencyCheck::getAllValsBetween(inputs, std::vector<Val*>{inp_tv});

    // Membership in dep_vals is queried for every consumer of every
    // dependent tensor, so build a hash set once per op instead of
    // scanning the vector each time.
    const std::unordered_set<Val*> dep_val_set(
        dep_vals.begin(), dep_vals.end());

    // For each tensor that inp_tv depends on, check if the resize op
    // is considered non-exclusive with respect to the tensor. That
    // is, if propagation of the resize may result in externally
    // visible changes through the tensor, the resize is considered
    // non-exclusive.
    for (auto dep_tv : ir_utils::filterByType<TensorView>(dep_vals)) {
      bool maybe_non_exclusive = dep_tv->isFusionOutput();

      if (!maybe_non_exclusive) {
        // If a dependent tv has a consumer that inp_tv does not
        // depend on, propagation of resize would escape to outputs,
        // which needs to be avoided.
        for (auto consumer_tv : ir_utils::consumerTvsOf(dep_tv)) {
          // We are interested in if resized IDs are used by other tensors
          // than out_tv
          if (consumer_tv != out_tv && dep_val_set.count(consumer_tv) == 0) {
            maybe_non_exclusive = true;
            break;
          }
        }
      }

      if (!maybe_non_exclusive) {
        continue;
      }

      // dep_tv potentially is either a fusion output or it has a
      // consumer outside of the dependency set to the resized
      // tensor. Propagating the resize to dep_tv should be
      // avoided. However, if the dep_tv iter domain that corresponds
      // to the resized ID is a broadcast or there's no such ID, it
      // should still be safe to consider the resize op exclusive as
      // there's no iter domain to resize. For a concrete example, see
      // ResizeSchedulerTest.PropagateMultipleSlicesToInputs4.
      const auto inp_tv_logical_groups =
          exact_graph.toGroups(inp_tv->getLogicalDomain());
      const auto dep_tv_logical_groups =
          exact_graph.toGroups(dep_tv->getLogicalDomain());
      auto vals_between = getValsBetween<ValGraphBFS>(
          {inp_tv_logical_groups.begin(), inp_tv_logical_groups.end()},
          {dep_tv_logical_groups.begin(), dep_tv_logical_groups.end()},
          exact_graph);

      for (const ValGroup& resize_inp_id : resize_inp_ids) {
        if (std::find(
                vals_between.begin(), vals_between.end(), resize_inp_id) ==
            vals_between.end()) {
          // This resize can be ignored as there's no corresponding ID
          // in the dep tv
          continue;
        }

        // This resize input ID is not exclusively used
        non_exclusive_resizes[inp_tv].pushBack(resize_inp_id);
      }
    }

    // Analysis of exclusiveness until inp_tv is done. Following
    // resize-based tensor ops do not need to check the same section
    // of the fusion and can start from out_tv.
    inputs.insert(out_tv);
  }

  return non_exclusive_resizes;
}

} // namespace scheduler_tools
} // namespace nvfuser
79 changes: 79 additions & 0 deletions csrc/scheduler/tools/resize_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@
// clang-format on
#pragma once

#include <val_graph.h>

namespace nvfuser {

class Expr;
class TensorView;

namespace scheduler_tools {

Expand All @@ -19,5 +22,81 @@ namespace scheduler_tools {
// fusion inputs are skipped as their loop domains don't matter.
void propagateResizeToInputs(Expr* resize_op);

// Given a topologically ordered list of resize-based tensor ops such
// as slice and pad, check if they can be propagated to fusion inputs
// exclusively without causing any visible side effect. For example,
// if a tensor is sliced and also is used to produce an output without
// the slicing, the slice is considered non exclusive as the slice
// input has the other visible consumer. Propagating the resize of the
// slice to the slice input is invalid since the output computed from
// the slice input depends on the full iteration space.
//
// For example, consider the following case:
//
// t0 = makeSymbolicTensor(1)
// fusion.addInput(t0)
// t1 = t0 + 1
// t2 = t1[1:10]
// t3 = t1 + 1
// fusion.addOutput(t2)
// fusion.addOutput(t3)
//
// In this case, propagating the resize op of the slice would alter t1,
// which would in turn affect t3, which is a fusion output. Since the
// change would be visible due to the change of t3, this resize op is
// considered non-exclusive.
//
// Consider a slightly different case as shown below:
//
// t0 = makeSymbolicTensor(1)
// fusion.addInput(t0)
// t1 = t0[1:10]
// t2 = t0 + 1
// fusion.addOutput(t1)
// fusion.addOutput(t2)
//
// Note that the slice is directly done with the fusion input. Since
// we do not propagate resize ops to fusion inputs, this could be
// considered exclusive. However, it is still treated as
// non-exclusive since the actual scheduling inserts a cache after t0,
// which can cause a visible side effect if the resize is propagated.
//
// Another non-exclusiveness comes from dependent fusion outputs. For
// example, if a slice input depends on a fusion output, propagation
// would alter the fusion output. Consider a case like:
//
// t0 = makeSymbolicTensor(1)
// fusion.addInput(t0)
// t1 = t0 + 1
// t2 = t1[1:10] // slice
// fusion.addOutput(t1)
// fusion.addOutput(t2)
//
// If the resize op for the slice is propagated to t1, only the
// section of [1:10] would be computed. Since that would change a
// fusion output, the resize op is considered non-exclusive.
//
// When there's a chain of resize-based ops, for example:
//
// t0 = makeSymbolicTensor(1)
// fusion.addInput(t0)
// t1 = t0 + 1
// t2 = t1[1:10]
// t3 = t2[2:5]
// t4 = t1 + 1
// fusion.addOutput(t3)
// fusion.addOutput(t4)
//
// We do not consider the second slice as non-exclusive as
// long as the first slice is considered non-exclusive. This will be
// important when resolving the non-exclusiveness by replication.
//
// The function returns a map from tensors that are input to
// non-exclusive ops to their resize input ID groups. This map will be
// used to resolve the non-exclusiveness by replication.
std::unordered_map<TensorView*, ValGroups> getNonExclusiveResizeInfo(
const std::vector<Expr*>& ordered_resize_tensor_ops,
const ValGraph& exact_graph);

} // namespace scheduler_tools
} // namespace nvfuser
2 changes: 0 additions & 2 deletions tests/cpp/test_gpu3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9249,8 +9249,6 @@ TEST_F(NVFuserTest, AllIdsMultipleDependencies) {
tv1->split(0, 4);
tv1->split(0, 8);

fusion.print();

auto all_ids = tv1->domain()->allIDs();

auto split2 = tv1->axis(0)->definition()->as<Split>();
Expand Down
6 changes: 3 additions & 3 deletions tests/cpp/test_matmul_scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1060,7 +1060,7 @@ TEST_F(MatmulSchedulerTest, FusedMultiplySumOnly) {
// for Ampere with strict ref check, hence single layout check
TEST_F(MatmulSchedulerTest, BasicMatmulStrictCheckTT) {
// TODO: Make these tests work with Hopper as well as Ampere
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 8, 9);
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0);

const int M = 128, N = 256, K = 512;
const auto layout = MmaLayout::TT;
Expand Down Expand Up @@ -2481,7 +2481,7 @@ class MatmulSchedulerPluginTest : public NVFuserTest {

// Test that our fake plugin works to override the default heuristic
TEST_F(MatmulSchedulerPluginTest, BasicMatmul) {
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 8, 9);
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0);
const int M = 128, N = 256, K = 512;
const auto layout = MmaLayout::TT;
auto fusion = std::make_unique<Fusion>();
Expand Down Expand Up @@ -3156,7 +3156,7 @@ INSTANTIATE_TEST_SUITE_P(
#undef NVFUSER_TEST_CUDA_ARCH_GUARD

TEST_F(MatmulSchedulerTest, OperandOrderIssue2434) {
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 8, 9);
NVFUSER_TEST_CUDA_ARCH_RANGE_GUARD(8, 0, 9, 0);
int M = 32, N = 64, K = 128;

std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Expand Down
Loading

0 comments on commit 48e620f

Please sign in to comment.