Skip to content

Commit 1e22953

Browse files
reedwm authored and Google-ML-Automation committed
Remove Thunk::Cleanup method.
The method was effectively unused, since it wasn't overridden by SequentialThunk, and so SequentialThunk wouldn't call Cleanup on its subthunks. NcclRaggedAllToAllStartThunk overrode Cleanup to free some device buffers, but these were never freed since Cleanup was not called. The memory is now stored in DeviceMemoryHandles, which automatically free the buffers in the destructor. PiperOrigin-RevId: 725409574
1 parent 4825d5d commit 1e22953

File tree

5 files changed

+12
-58
lines changed

5 files changed

+12
-58
lines changed

xla/backends/gpu/runtime/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -787,6 +787,7 @@ cc_library(
787787
"//xla/service:collective_ops_utils",
788788
"//xla/service/gpu/transforms/collectives:collective_ops_utils",
789789
"//xla/stream_executor:device_memory",
790+
"//xla/stream_executor:device_memory_handle",
790791
"//xla/stream_executor:memory_allocation",
791792
"//xla/stream_executor:stream",
792793
"//xla/tsl/platform:errors",

xla/backends/gpu/runtime/nccl_ragged_all_to_all_thunk.cc

+7-18
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ limitations under the License.
4141
#include "xla/shape.h"
4242
#include "xla/shape_util.h"
4343
#include "xla/stream_executor/device_memory.h"
44+
#include "xla/stream_executor/device_memory_handle.h"
4445
#include "xla/stream_executor/memory_allocation.h"
4546
#include "xla/stream_executor/stream.h"
4647
#include "xla/tsl/platform/errors.h"
@@ -172,15 +173,16 @@ absl::Status NcclRaggedAllToAllStartThunk::Initialize(
172173
}
173174

174175
if (!device_buffer_allocs_.contains(params.executor)) {
175-
se::DeviceMemoryBase output_offsets_device_buffer =
176-
params.executor->Allocate(config_.num_ragged_rows * sizeof(int64_t));
176+
se::DeviceMemoryHandle output_offsets_device_buffer{
177+
params.executor,
178+
params.executor->Allocate(config_.num_ragged_rows * sizeof(int64_t))};
177179

178-
if (output_offsets_device_buffer.is_null()) {
180+
if (output_offsets_device_buffer.memory().is_null()) {
179181
return absl::InternalError("Failed to allocate output offsets buffer.");
180182
}
181183

182184
device_buffer_allocs_.emplace(params.executor,
183-
output_offsets_device_buffer);
185+
std::move(output_offsets_device_buffer));
184186
}
185187

186188
if (should_use_memcpy()) {
@@ -214,19 +216,6 @@ absl::Status NcclRaggedAllToAllStartThunk::Initialize(
214216
return absl::OkStatus();
215217
}
216218

217-
absl::Status NcclRaggedAllToAllStartThunk::Cleanup(
218-
const CleanupParams& params) {
219-
absl::MutexLock lock(&mutex_);
220-
221-
if (device_buffer_allocs_.contains(params.executor)) {
222-
se::DeviceMemoryBase alloc =
223-
device_buffer_allocs_.extract(params.executor).mapped();
224-
params.executor->Deallocate(&alloc);
225-
}
226-
227-
return absl::OkStatus();
228-
}
229-
230219
bool NcclRaggedAllToAllStartThunk::is_local() const {
231220
CHECK_NE(device_count_, -1);
232221
for (const auto& replica_group : config_.config.replica_groups) {
@@ -267,7 +256,7 @@ absl::Status NcclRaggedAllToAllStartThunk::RunNcclCollective(
267256

268257
auto jt = device_buffer_allocs_.find(stream.parent());
269258
CHECK(jt != device_buffer_allocs_.end());
270-
output_offsets_device_buffer = jt->second;
259+
output_offsets_device_buffer = jt->second.memory();
271260
}
272261

273262
if (should_use_memcpy()) {

xla/backends/gpu/runtime/nccl_ragged_all_to_all_thunk.h

+2-3
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ limitations under the License.
3232
#include "xla/hlo/ir/hlo_instructions.h"
3333
#include "xla/service/collective_ops_utils.h"
3434
#include "xla/stream_executor/device_memory.h"
35+
#include "xla/stream_executor/device_memory_handle.h"
3536
#include "xla/stream_executor/memory_allocation.h"
3637
#include "xla/stream_executor/stream.h"
3738

@@ -61,8 +62,6 @@ class NcclRaggedAllToAllStartThunk : public NcclCollectiveThunk {
6162

6263
absl::Status Initialize(const InitializeParams& params) override;
6364

64-
absl::Status Cleanup(const CleanupParams& params) override;
65-
6665
static const char* GetHloOpName() { return "ragged-all-to-all-start"; }
6766

6867
static CollectiveOpGroupMode GetGroupMode(
@@ -92,7 +91,7 @@ class NcclRaggedAllToAllStartThunk : public NcclCollectiveThunk {
9291
std::vector<std::unique_ptr<se::MemoryAllocation>>>
9392
host_buffer_allocs_ ABSL_GUARDED_BY(mutex_);
9493

95-
absl::flat_hash_map<se::StreamExecutor*, se::DeviceMemoryBase>
94+
absl::flat_hash_map<se::StreamExecutor*, se::DeviceMemoryHandle>
9695
device_buffer_allocs_ ABSL_GUARDED_BY(mutex_);
9796

9897
absl::Mutex pointers_mutex_;

xla/backends/gpu/runtime/thunk.h

-25
Original file line numberDiff line numberDiff line change
@@ -423,23 +423,6 @@ class Thunk {
423423
bool requires_exclusive_lock_on_gpu = false);
424424
};
425425

426-
//===--------------------------------------------------------------------===//
427-
// CleanupParams
428-
//===--------------------------------------------------------------------===//
429-
430-
// Parameters passed to Cleanup. Before returning from executable execution,
431-
// thunks may need to clean up any resource allocated or registered through
432-
// runtime APIs.
433-
struct CleanupParams {
434-
se::StreamExecutor* executor = nullptr;
435-
436-
// Parameters for executing collective operations.
437-
CollectiveExecuteParams* collective_params = nullptr;
438-
439-
// Collective cliques acquired based on resource requests.
440-
CollectiveCliques* collective_cliques = nullptr;
441-
};
442-
443426
//===--------------------------------------------------------------------===//
444427

445428
Thunk(Kind kind, ThunkInfo thunk_info)
@@ -481,14 +464,6 @@ class Thunk {
481464
// Precondition: Initialize(initialize_params) has been called.
482465
virtual absl::Status ExecuteOnStream(const ExecuteParams& params) = 0;
483466

484-
// Cleans up any resources after thunk execution.
485-
//
486-
// This may be called multiple times. Its main purpose is to free up
487-
// any resources occupied after initialization and execution.
488-
virtual absl::Status Cleanup(const CleanupParams& params) {
489-
return absl::OkStatus();
490-
}
491-
492467
static absl::string_view KindToString(Thunk::Kind kind);
493468

494469
ExecutionStreamId execution_stream_id() const { return execution_stream_id_; }

xla/service/gpu/gpu_executable.cc

+2-12
Original file line numberDiff line numberDiff line change
@@ -347,18 +347,8 @@ absl::Status ExecuteThunks(
347347

348348
TF_RETURN_IF_ERROR(thunk_sequence.ExecuteOnStream(execute_params));
349349

350-
auto status =
351-
MaybeSyncAndProfile(run_options, execution_timer.get(),
352-
block_host_until_done ? main_stream : nullptr);
353-
354-
Thunk::CleanupParams cleanup_params{
355-
executor,
356-
&collective_params,
357-
&collective_cliques,
358-
};
359-
TF_RETURN_IF_ERROR(thunk_sequence.Cleanup(cleanup_params));
360-
361-
return status;
350+
return MaybeSyncAndProfile(run_options, execution_timer.get(),
351+
block_host_until_done ? main_stream : nullptr);
362352
}
363353

364354
namespace {

0 commit comments

Comments (0)