Skip to content

Commit a5022da

Browse files
authored
Fix the legacy loop indexing traversal (#3373)
This is a temporary WAR for #3374. It's temporary since the repro has no problem with the IdModel-based indexer. This is for unblocking @IvanYashchuk until we can make the new indexer enabled by default. The root cause of the issue is when we attempt to find a correct indexing path from the loop domain to the allocation domain of the indexed tensor, the algorithm fails to find a path visiting a backward merge when the indexed tensor has only one of the inputs. That happens when the tensor is broadcast and gets inlined with broadcast forwarding. In the current code, in that case, it just picks the first traversal option, which I think happens to be working fine, but that's not necessarily the right choice, particularly because we are looking at all candidate next traversal targets that are permissively mapped. The WAR is simply picking a candidate as long as it has at least one mapped ID. I think this would be good enough as a temporary WAR. Fixes #3374
1 parent 38f7152 commit a5022da

File tree

2 files changed

+90
-0
lines changed

2 files changed

+90
-0
lines changed

csrc/device_lower/analysis/index_compute.cpp

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1372,6 +1372,10 @@ std::unordered_set<IterDomain*> buildLoopIndexingPreferredPath(
13721372
// multiple such IDs exist, select one whose input IDs are mapped with
13731373
// the consumer IDs. This is to ensure the path from the loop
13741374
// IterDomains to the root matches with the consumer tensor.
1375+
// Additionally, when none of the candidate iter domains has all of its
1376+
// inputs mapped with the consumer tensor, prefer one that has at
1377+
// least one mapped. This matters when the consumer tensor only has
1378+
// one of the merge inputs, for example.
13751379
IterDomain* getLogicalIDToTraverse(
13761380
IterDomain* id,
13771381
const std::vector<Val*>& consumer_all_ids) {
@@ -1382,6 +1386,9 @@ IterDomain* getLogicalIDToTraverse(
13821386
return nullptr;
13831387
}
13841388

1389+
// Keep track of an iter domain that has at least one input mapped.
1390+
IterDomain* fallback_candidate = nullptr;
1391+
13851392
for (auto logical_id : logical_ids) {
13861393
auto def = logical_id->definition();
13871394
if (def == nullptr) {
@@ -1398,6 +1405,22 @@ IterDomain* getLogicalIDToTraverse(
13981405
})) {
13991406
return logical_id;
14001407
}
1408+
1409+
if (std::any_of(
1410+
logical_id_inputs.begin(),
1411+
logical_id_inputs.end(),
1412+
[&](IterDomain* logical_id_input) {
1413+
return isPermissivelyMappedWithAny(
1414+
logical_id_input, consumer_all_ids);
1415+
})) {
1416+
if (fallback_candidate == nullptr) {
1417+
fallback_candidate = logical_id;
1418+
}
1419+
}
1420+
}
1421+
1422+
if (fallback_candidate != nullptr) {
1423+
return fallback_candidate;
14011424
}
14021425

14031426
// No mapped ID found, which means the consumer is a post-view

tests/cpp/test_indexing.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5172,4 +5172,71 @@ TEST_F(IndexingTest, PerDimLogicalIndices) {
51725172
lower.run();
51735173
}
51745174

5175+
// Repro of issue #3374
5176+
// (https://github.com/NVIDIA/Fuser/issues/3374). Previously failed
5177+
// with an error message of:
5178+
// Couldn't find allocation mapping for T14_l_float[ iblockIdx.x269{(
5179+
// ceilDiv(2, blockDim.x) )}, ithreadIdx.x270{blockDim.x}, iS278{(
5180+
// ceilDiv(( ceilDiv(( ceilDiv(( ceilDiv(32768, blockDim.y) ), 8) ),
5181+
// 1) ), gridDim.y) )}, iblockIdx.y277{gridDim.y},
5182+
// ithreadIdx.y272{blockDim.y}, iUS276{1}, iUR274{8} ] ca_pos( 6 )
5183+
// dim: 1 id: iS57{2}
5184+
TEST_F(IndexingTest, Issue3374) {
5185+
auto fusion_ptr = std::make_unique<Fusion>();
5186+
auto& fusion = *fusion_ptr;
5187+
FusionGuard fg(fusion_ptr.get());
5188+
5189+
std::vector<int64_t> shape1{28, 32768, 2};
5190+
std::vector<int64_t> shape2{32768, 2};
5191+
std::vector<int64_t> shape3{28, 32768, 1};
5192+
std::vector<int64_t> shape4{32768, 56};
5193+
5194+
auto tv0 =
5195+
TensorViewBuilder().shape(shape1).contiguity({true, false, true}).build();
5196+
fusion.addInput(tv0);
5197+
auto tv1 = TensorViewBuilder().shape(shape2).contiguity({true, true}).build();
5198+
fusion.addInput(tv1);
5199+
auto tv2 = TensorViewBuilder()
5200+
.shape(shape3)
5201+
.contiguity({true, false, std::nullopt})
5202+
.build();
5203+
fusion.addInput(tv2);
5204+
auto tv3 = TensorViewBuilder()
5205+
.shape(shape3)
5206+
.contiguity({true, false, std::nullopt})
5207+
.build();
5208+
fusion.addInput(tv3);
5209+
5210+
auto tv4 = pad(tv2, {fusion.oneVal(), fusion.zeroVal()});
5211+
auto tv5 = pad(tv3, {fusion.zeroVal(), fusion.oneVal()});
5212+
auto tv6 = add(tv4, tv5);
5213+
auto tv7 = broadcast(tv1, {true, false, false});
5214+
auto tv8 = mul(tv7, tv0);
5215+
auto tv9 = add(tv6, tv8);
5216+
auto tv10 = permute(tv9, {1, 0, 2});
5217+
std::vector<Val*> reshape_shape;
5218+
std::transform(
5219+
shape4.begin(),
5220+
shape4.end(),
5221+
std::back_inserter(reshape_shape),
5222+
[](int64_t s) { return IrBuilder::create<Val>(s, DataType::Index); });
5223+
auto tv11 = reshape(tv10, reshape_shape);
5224+
auto tv12 = sum(tv11, {0});
5225+
fusion.addOutput(tv12);
5226+
fusion.addOutput(tv11);
5227+
fusion.addOutput(tv7);
5228+
5229+
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
5230+
auto t0 = at::randn(shape1, options);
5231+
auto t1 = at::randn(shape2, options);
5232+
auto t2 = at::randn(shape3, options);
5233+
auto t3 = at::randn(shape3, options);
5234+
std::vector<c10::IValue> inputs{t0, t1, t2, t3};
5235+
5236+
FusionExecutorCache executor_cache(std::move(fusion_ptr));
5237+
auto outputs = executor_cache.runFusionWithInputs(inputs);
5238+
5239+
testValidate(executor_cache.fusion(), outputs, inputs, __LINE__, __FILE__);
5240+
}
5241+
51755242
} // namespace nvfuser

0 commit comments

Comments
 (0)