Skip to content

Commit a5022da

Browse files
authored
Fix the legacy loop indexing traversal (#3373)
This is a temporary WAR for #3374. It's temporary since the repro has no problem with the IdModel-based indexer. This is for unblocking @IvanYashchuk until we can make the new indexer enabled by default. The root cause of the issue is when we attempt to find a correct indexing path from the loop domain to the allocation domain of the indexed tensor, the algorithm fails to find a path visiting a backward merge when the indexed tensor has only one of the inputs. That happens when the tensor is broadcast and gets inlined with broadcast forwarding. In the current code, in that case, it just picks the first traversal option, which I think happens to be working fine, but that's not necessarily the right choice, particularly because we are looking at all candidate next traversal targets that are permissively mapped. The WAR is simply picking a candidate as long as it has at least one mapped ID. I think this would be good enough as a temporary WAR. Fixes #3374
1 parent 38f7152 commit a5022da

File tree

2 files changed

+90
-0
lines changed

2 files changed

+90
-0
lines changed

csrc/device_lower/analysis/index_compute.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,6 +1372,10 @@ std::unordered_set<IterDomain*> buildLoopIndexingPreferredPath(
13721372
// multiple such IDs exist, select one whose input IDs are mapped with
13731373
// the consumer IDs. This is to ensure the path from the loop
13741374
// IterDomains to the root matches with the consumer tensor.
1375+
// Additionally, when none of the candidate iter domains has all of its
1376+
// inputs mapped with the consumer tensor, prefer one that has at
1377+
// least one mapped. This matters when the consumer tensor only has
1378+
// one of the merge inputs, for example.
13751379
IterDomain* getLogicalIDToTraverse(
13761380
IterDomain* id,
13771381
const std::vector<Val*>& consumer_all_ids) {
@@ -1382,6 +1386,9 @@ IterDomain* getLogicalIDToTraverse(
13821386
return nullptr;
13831387
}
13841388

1389+
// Keep track of an iter domain that has at least one input mapped.
1390+
IterDomain* fallback_candidate = nullptr;
1391+
13851392
for (auto logical_id : logical_ids) {
13861393
auto def = logical_id->definition();
13871394
if (def == nullptr) {
@@ -1398,6 +1405,22 @@ IterDomain* getLogicalIDToTraverse(
13981405
})) {
13991406
return logical_id;
14001407
}
1408+
1409+
if (std::any_of(
1410+
logical_id_inputs.begin(),
1411+
logical_id_inputs.end(),
1412+
[&](IterDomain* logical_id_input) {
1413+
return isPermissivelyMappedWithAny(
1414+
logical_id_input, consumer_all_ids);
1415+
})) {
1416+
if (fallback_candidate == nullptr) {
1417+
fallback_candidate = logical_id;
1418+
}
1419+
}
1420+
}
1421+
1422+
if (fallback_candidate != nullptr) {
1423+
return fallback_candidate;
14011424
}
14021425

14031426
// No mapped ID found, which means the consumer is a post-view

tests/cpp/test_indexing.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5172,4 +5172,71 @@ TEST_F(IndexingTest, PerDimLogicalIndices) {
51725172
lower.run();
51735173
}
51745174

5175+
// Repro of issue #3374
5176+
// (https://github.com/NVIDIA/Fuser/issues/3374). Previously failed
5177+
// with an error message of:
5178+
// Couldn't find allocation mapping for T14_l_float[ iblockIdx.x269{(
5179+
// ceilDiv(2, blockDim.x) )}, ithreadIdx.x270{blockDim.x}, iS278{(
5180+
// ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(32768, blockDim.y) ), 8) ),
5181+
// 1) ), gridDim.y) )}, iblockIdx.y277{gridDim.y},
5182+
// ithreadIdx.y272{blockDim.y}, iUS276{1}, iUR274{8} ] ca_pos( 6 )
5183+
// dim: 1 id: iS57{2}
5184+
TEST_F(IndexingTest, Issue3374) {
5185+
auto fusion_ptr = std::make_unique<Fusion>();
5186+
auto& fusion = *fusion_ptr;
5187+
FusionGuard fg(fusion_ptr.get());
5188+
5189+
std::vector<int64_t> shape1{28, 32768, 2};
5190+
std::vector<int64_t> shape2{32768, 2};
5191+
std::vector<int64_t> shape3{28, 32768, 1};
5192+
std::vector<int64_t> shape4{32768, 56};
5193+
5194+
auto tv0 =
5195+
TensorViewBuilder().shape(shape1).contiguity({true, false, true}).build();
5196+
fusion.addInput(tv0);
5197+
auto tv1 = TensorViewBuilder().shape(shape2).contiguity({true, true}).build();
5198+
fusion.addInput(tv1);
5199+
auto tv2 = TensorViewBuilder()
5200+
.shape(shape3)
5201+
.contiguity({true, false, std::nullopt})
5202+
.build();
5203+
fusion.addInput(tv2);
5204+
auto tv3 = TensorViewBuilder()
5205+
.shape(shape3)
5206+
.contiguity({true, false, std::nullopt})
5207+
.build();
5208+
fusion.addInput(tv3);
5209+
5210+
auto tv4 = pad(tv2, {fusion.oneVal(), fusion.zeroVal()});
5211+
auto tv5 = pad(tv3, {fusion.zeroVal(), fusion.oneVal()});
5212+
auto tv6 = add(tv4, tv5);
5213+
auto tv7 = broadcast(tv1, {true, false, false});
5214+
auto tv8 = mul(tv7, tv0);
5215+
auto tv9 = add(tv6, tv8);
5216+
auto tv10 = permute(tv9, {1, 0, 2});
5217+
std::vector<Val*> reshape_shape;
5218+
std::transform(
5219+
shape4.begin(),
5220+
shape4.end(),
5221+
std::back_inserter(reshape_shape),
5222+
[](int64_t s) { return IrBuilder::create<Val>(s, DataType::Index); });
5223+
auto tv11 = reshape(tv10, reshape_shape);
5224+
auto tv12 = sum(tv11, {0});
5225+
fusion.addOutput(tv12);
5226+
fusion.addOutput(tv11);
5227+
fusion.addOutput(tv7);
5228+
5229+
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5230+
auto t0 = at::randn(shape1, options);
5231+
auto t1 = at::randn(shape2, options);
5232+
auto t2 = at::randn(shape3, options);
5233+
auto t3 = at::randn(shape3, options);
5234+
std::vector<c10::IValue> inputs{t0, t1, t2, t3};
5235+
5236+
FusionExecutorCache executor_cache(std::move(fusion_ptr));
5237+
auto outputs = executor_cache.runFusionWithInputs(inputs);
5238+
5239+
testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__);
5240+
}
5241+
51755242
} // namespace nvfuser

0 commit comments

Comments
 (0)