PR tensorflow#22437: Added frontend attribute handling to explicit_stream_annotation_async_wrapper

chaserileyroberts · tensorflower-gardener · commit d401ee1c4ad9 · 2025-02-27T20:01:27.000-08:00
Imported from GitHub PR openxla/xla#22437 This is a small change that ensures the frontend attributes are correctly passed to both the `async-start` and `async-done` instructions of the created pair. This also clears the scheduling attributes that are directly on the call operation and inner ops. The specific goal of this change is to have stable support for combining scheduling group ids with stream annotations in JAX. ```python with set_xla_metadata(_scheduling_group_id=1): result = compute_on("gpu_stream:1")(jitted_func)(...) ``` Currently, the issue stems from the `set_xla_metadata` context manager, which will apply the frontend attribute to all operations, including the ones within our `jitted_func`. When the same scheduling annotation is found in two `HloComputation`s, an error is raised in `LegalizeSchedulingAnnotations`. This change is intended to avoid hitting this check by cleaning up the annotations on the wrapped streamed computation. Copybara import of the project: -- 994c2eee3c946102270587681f5c17b994cbb6a9 by chaser <chaser@nvidia.com>: Added frontend attributed handling -- 9db58b2b988dc2288d42126271223f924aac19f9 by chaser <chaser@nvidia.com>: Added clearing of scheduling annotations -- a83e32a34ba5d64a29c7f01b03536f27decd8125 by chaser <chaser@nvidia.com>: Added HloInstruction.erase_frontend_attribute Merging this change closes tensorflow#22437 PiperOrigin-RevId: 731960979
diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction.h b/third_party/xla/xla/hlo/ir/hlo_instruction.h
@@ -1871,6 +1871,10 @@ class HloInstruction {
     return it.second;
   }
 
+  // Removes the frontend attribute stored under `key`, if present. Returns the
+  // number of entries removed: 1 when `key` existed, 0 otherwise.
+  size_t erase_frontend_attribute(const std::string& key) {
+    auto* attribute_map = mutable_rare()->frontend_attributes.mutable_map();
+    return attribute_map->erase(key);
+  }
+
   // Adds or overrides a single attribute in the HloInstruction.
   void set_frontend_attribute(const std::string& key,
                               const std::string& value) {
diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction_test.cc b/third_party/xla/xla/hlo/ir/hlo_instruction_test.cc
@@ -61,5 +61,17 @@ TEST(HloInstruction, AddFrontendAttributes) {
   EXPECT_EQ(instr.get_frontend_attribute("key2").value(), "value2");
 }
 
+// Verifies that erase_frontend_attribute removes only the requested key and
+// reports how many entries it removed.
+TEST(HloInstruction, EraseFrontendAttribute) {
+  // Start from a constant instruction carrying two frontend attributes.
+  HloConstantInstruction constant(ShapeUtil::MakeShape(U32, {3, 2}));
+  constant.add_frontend_attribute("key1", "value1");
+  constant.add_frontend_attribute("key2", "value2");
+
+  // A present key erases exactly one entry; an unknown key erases none.
+  const auto erased_for_present_key =
+      constant.erase_frontend_attribute("key2");
+  const auto erased_for_absent_key =
+      constant.erase_frontend_attribute("not_a_key");
+  EXPECT_EQ(erased_for_present_key, 1);
+  EXPECT_EQ(erased_for_absent_key, 0);
+
+  // The untouched attribute is still readable, while the erased one is gone.
+  EXPECT_EQ(constant.get_frontend_attribute("key1").value(), "value1")
+      << "key1 should not be erased";
+  EXPECT_EQ(constant.get_frontend_attribute("key2"), std::nullopt)
+      << "key2 should have been erased";
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1608,6 +1608,7 @@ xla_cc_test(
     srcs = ["explicit_stream_annotation_async_wrapper_test.cc"],
     deps = [
         ":explicit_stream_annotation_async_wrapper",
+        "//xla:side_effect_util",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/service/gpu:backend_configs_cc",
diff --git a/third_party/xla/xla/service/gpu/transforms/explicit_stream_annotation_async_wrapper.cc b/third_party/xla/xla/service/gpu/transforms/explicit_stream_annotation_async_wrapper.cc
@@ -33,21 +33,41 @@ limitations under the License.
 namespace xla::gpu {
 
 namespace {
+
+// Strips the scheduling-group-id and stream-annotation frontend attributes
+// from `instr`. These attributes are only valid on the async pairs, so they
+// are removed from every other instruction the wrapper touches. Attributes
+// that are not present are simply left alone.
+void ClearSchedulingAnnotations(HloInstruction* instr) {
+  for (auto attribute_name :
+       {kXlaSchedulingGroupIdAttr, kXlaStreamAnnotationAttr}) {
+    instr->erase_frontend_attribute(attribute_name);
+  }
+}
+
 static absl::StatusOr<bool> AsynchronizeInstruction(HloInstruction* instr) {
   if (instr->opcode() != HloOpcode::kCall ||
       !instr->frontend_attributes().map().contains(kXlaStreamAnnotationAttr)) {
     return false;
   }
   HloComputation* computation = instr->parent();
+  auto original_attributes = instr->frontend_attributes();
+
+  // These annotations are only legal on the async instructions and
+  // can cause issues if the annotations remain on the inner operations,
+  // so we clear them before creating the async pair.
+  for (auto* inner_instr : instr->called_computations()[0]->instructions()) {
+    ClearSchedulingAnnotations(inner_instr);
+  }
+  ClearSchedulingAnnotations(instr);
+
   TF_ASSIGN_OR_RETURN(
       HloInstruction * done,
       computation->CreateAsyncInstructions(
           instr, {},
           ExplicitStreamAnnotationAsyncWrapper::kExplicitExecutionThread,
           /*replace=*/true));
+  // Replace the original attributes after creating the async pair.
+  done->set_frontend_attributes(original_attributes);
+  done->mutable_operand(0)->set_frontend_attributes(original_attributes);
   TF_ASSIGN_OR_RETURN(GpuBackendConfig gpu_config,
                       done->backend_config<GpuBackendConfig>());
-  // Set the false delay of done op to be false so it can be scheduled
+  // Set earliest schedule of done op to be false so it can be scheduled
   // far apart from start.
   gpu_config.set_force_earliest_schedule(false);
   TF_RETURN_IF_ERROR(done->set_backend_config(gpu_config));
diff --git a/third_party/xla/xla/service/gpu/transforms/explicit_stream_annotation_async_wrapper_test.cc b/third_party/xla/xla/service/gpu/transforms/explicit_stream_annotation_async_wrapper_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/filecheck.h"
 #include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/side_effect_util.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
@@ -72,21 +73,23 @@ TEST_F(ExplicitStreamAnnotationAsyncWrapperTest, OverlappingGemms) {
   %gemm1 (z: f32[2048,2048], w: f32[2048,2048]) -> f32[2048,2048] {
     %w = f32[2048,2048]{1,0} parameter(1)
     %z = f32[2048,2048]{1,0} parameter(0)
-    %custom-call.1 = (f32[2048,2048]{1,0}, s8[33554432]{0}) custom-call(f32[2048,2048]{1,0} %w, f32[2048,2048]{1,0} %z), custom_call_target="__cublas$gemm"
+    %custom-call.1 = (f32[2048,2048]{1,0}, s8[33554432]{0}) custom-call(f32[2048,2048]{1,0} %w, f32[2048,2048]{1,0} %z), custom_call_target="__cublas$gemm", 
+      frontend_attributes={_scheduling_group_id="0", _xla_stream_annotation="1"}
     ROOT %get-tuple-element = f32[2048,2048]{1,0} get-tuple-element((f32[2048,2048]{1,0}, s8[33554432]{0}) %custom-call.1), index=0
   }
   %gemm2 (a: f32[2048,2048], b: f32[2048,2048]) -> f32[2048,2048] {
     %a = f32[2048,2048]{1,0} parameter(1)
     %b = f32[2048,2048]{1,0} parameter(0)
-    %custom-call.2 = (f32[2048,2048]{1,0}, s8[33554432]{0}) custom-call(f32[2048,2048]{1,0} %a, f32[2048,2048]{1,0} %b), custom_call_target="__cublas$gemm"
+    %custom-call.2 = (f32[2048,2048]{1,0}, s8[33554432]{0}) custom-call(f32[2048,2048]{1,0} %a, f32[2048,2048]{1,0} %b), custom_call_target="__cublas$gemm",
+          frontend_attributes={_scheduling_group_id="1", _xla_stream_annotation="2"}
     ROOT %get-tuple-element = f32[2048,2048]{1,0} get-tuple-element((f32[2048,2048]{1,0}, s8[33554432]{0}) %custom-call.2), index=0
   }
 
   ENTRY %main () -> f32[2048,2048]{1,0} {
     %x = f32[2048,2048]{1,0} parameter(1), metadata={op_name="b" scheduling_name="x"}
     %y = f32[2048,2048]{1,0} parameter(0), metadata={op_name="a" scheduling_name="y"}
-    %call1 =  f32[2048,2048]{1,0} call(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y ), to_apply=%gemm1, frontend_attributes={_xla_stream_annotation="1"}
-    ROOT %call2 =  f32[2048,2048]{1,0} call(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y), to_apply=%gemm2, frontend_attributes={_xla_stream_annotation="2"}
+    %call1 =  f32[2048,2048]{1,0} call(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y ), to_apply=%gemm1, frontend_attributes={_scheduling_group_id="0", _xla_stream_annotation="2"}
+    ROOT %call2 =  f32[2048,2048]{1,0} call(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y), to_apply=%gemm2, frontend_attributes={_scheduling_group_id="1", _xla_stream_annotation="1"}
   })";
 
   auto debug_options = HloTestBase::GetDebugOptionsForTest();
@@ -96,16 +99,55 @@ TEST_F(ExplicitStreamAnnotationAsyncWrapperTest, OverlappingGemms) {
   ExplicitStreamAnnotationAsyncWrapper wrapper_pass;
 
   TF_ASSERT_OK_AND_ASSIGN(bool mutated, wrapper_pass.Run(module.get()));
+  ASSERT_TRUE(mutated);
+
   absl::StatusOr<bool> filecheck_result = RunFileCheck(module->ToString({}), R"(
-  // CHECK: %call-start = ((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) call-start(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y), async_execution_thread="explicit", to_apply=%gemm1, frontend_attributes={_xla_stream_annotation="1"} 
-  // CHECK: %call-done = f32[2048,2048]{1,0} call-done(((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) %call-start), frontend_attributes={_xla_stream_annotation="1"}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"force_earliest_schedule":false}
-  // CHECK: %call-start.1 = ((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) call-start(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y), async_execution_thread="explicit", to_apply=%gemm2, frontend_attributes={_xla_stream_annotation="2"}
-  // CHECK: ROOT %call-done.1 = f32[2048,2048]{1,0} call-done(((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) %call-start.1), frontend_attributes={_xla_stream_annotation="2"}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"force_earliest_schedule":false}
+  // CHECK: %call-start = ((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) call-start(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y), async_execution_thread="explicit", to_apply=%gemm1, frontend_attributes={_scheduling_group_id="0",_xla_stream_annotation="2"}
+  // CHECK: %call-done = f32[2048,2048]{1,0} call-done(((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) %call-start), frontend_attributes={_scheduling_group_id="0",_xla_stream_annotation="2"}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"force_earliest_schedule":false}
+  // CHECK: %call-start.1 = ((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) call-start(f32[2048,2048]{1,0} %x, f32[2048,2048]{1,0} %y), async_execution_thread="explicit", to_apply=%gemm2, frontend_attributes={_scheduling_group_id="1",_xla_stream_annotation="1"} 
+  // CHECK: ROOT %call-done.1 = f32[2048,2048]{1,0} call-done(((f32[2048,2048]{1,0}, f32[2048,2048]{1,0}), f32[2048,2048]{1,0}) %call-start.1), frontend_attributes={_scheduling_group_id="1",_xla_stream_annotation="1"}, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"force_earliest_schedule":false}
   )");
   TF_ASSERT_OK(filecheck_result.status());
   EXPECT_TRUE(*filecheck_result);
-
-  ASSERT_TRUE(mutated);
+  for (auto name : {"call-start", "call-done"}) {
+    EXPECT_EQ(FindInstruction(module.get(), name)
+                  ->frontend_attributes()
+                  .map()
+                  .find(kXlaStreamAnnotationAttr)
+                  ->second,
+              "2");
+    EXPECT_EQ(FindInstruction(module.get(), name)
+                  ->frontend_attributes()
+                  .map()
+                  .find(kXlaSchedulingGroupIdAttr)
+                  ->second,
+              "0");
+  }
+  for (auto name : {"call-start.1", "call-done.1"}) {
+    EXPECT_EQ(FindInstruction(module.get(), name)
+                  ->frontend_attributes()
+                  .map()
+                  .find(kXlaStreamAnnotationAttr)
+                  ->second,
+              "1");
+    EXPECT_EQ(FindInstruction(module.get(), name)
+                  ->frontend_attributes()
+                  .map()
+                  .find(kXlaSchedulingGroupIdAttr)
+                  ->second,
+              "1");
+  }
+  // Ensure the operations within the async computation are not annotated
+  // anymore.
+  for (auto annotation :
+       {kXlaSchedulingGroupIdAttr, kXlaStreamAnnotationAttr}) {
+    for (auto name : {"custom-call.1", "custom-call.2"}) {
+      EXPECT_FALSE(FindInstruction(module.get(), name)
+                       ->frontend_attributes()
+                       .map()
+                       .contains(annotation));
+    }
+  }
 }
 }  // namespace
 }  // namespace xla::gpu