
Commit

Cache the actual concrete Context pointer in each GpuExecutor class, and stop calling gpu_context() where possible.

PiperOrigin-RevId: 686600677
klucke authored and Google-ML-Automation committed Oct 16, 2024
1 parent 398c632 commit dceda27
Showing 5 changed files with 77 additions and 77 deletions.
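The change follows one pattern throughout: each concrete executor creates its CudaContext once in Init(), stores it both in the GpuExecutor base (via set_context()) and in a new cuda_context_ member, and then uses the cached member instead of the virtual gpu_context() accessor. A condensed sketch of that pattern, assembled from the hunks below (class and method bodies trimmed to the relevant lines, not the full class definition):

// Sketch only -- condensed from the diff, not the complete CudaExecutor class.
class CudaExecutor : public GpuExecutor {
 public:
  absl::Status Init() {
    TF_ASSIGN_OR_RETURN(device_, GetDevice(device_ordinal()));
    TF_ASSIGN_OR_RETURN(CudaContext * context,
                        CudaContext::Create(device_ordinal(), device_));
    set_context(context);     // base class keeps the generic Context*
    cuda_context_ = context;  // cached concrete pointer used from here on
    return absl::OkStatus();
  }

  std::unique_ptr<ActivateContext> Activate() {
    // Previously std::make_unique<ScopedActivateContext>(gpu_context()).
    return std::make_unique<ScopedActivateContext>(cuda_context_);
  }

 private:
  CudaContext* cuda_context_;  // CudaContext for this device
};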
67 changes: 34 additions & 33 deletions xla/stream_executor/cuda/cuda_executor.cc
@@ -555,30 +555,30 @@ absl::StatusOr<DeviceMemoryBase> CudaExecutor::GetMemoryRange(
}

std::unique_ptr<ActivateContext> CudaExecutor::Activate() {
return std::make_unique<ScopedActivateContext>(gpu_context());
return std::make_unique<ScopedActivateContext>(cuda_context_);
}

CudaExecutor::~CudaExecutor() {
CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
CHECK(kernel_to_gpu_binary_.empty()) << "CudaExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "CudaExecutor has loaded modules.";
set_context(nullptr);
}

void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
auto status = cuda::ToStatus(cuMemFree(pointer));
if (!status.ok()) {
LOG(ERROR) << "failed to free unified memory at " << location
<< "; result: " << status;
} else {
VLOG(2) << "deallocated unified memory at " << location << " for context "
<< gpu_context();
<< cuda_context_;
}
}

void* CudaExecutor::UnifiedMemoryAllocate(uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
CUdeviceptr result = 0;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status =
@@ -589,16 +589,17 @@ void* CudaExecutor::UnifiedMemoryAllocate(uint64_t size) {
return nullptr;
}
void* ptr = reinterpret_cast<void*>(result);
VLOG(2) << "allocated " << ptr << " for context " << gpu_context() << " of "
VLOG(2) << "allocated " << ptr << " for context " << cuda_context_ << " of "
<< size << " bytes in unified memory";
return ptr;
}

absl::Status CudaExecutor::Init() {
TF_ASSIGN_OR_RETURN(device_, GetDevice(device_ordinal()));
TF_ASSIGN_OR_RETURN(Context * context,
TF_ASSIGN_OR_RETURN(CudaContext * context,
CudaContext::Create(device_ordinal(), device_));
set_context(context);
cuda_context_ = context;
TF_RETURN_IF_ERROR(GetComputeCapability(&cc_major_, &cc_minor_, device_));
TF_ASSIGN_OR_RETURN(delay_kernels_supported_, DelayKernelIsSupported());
return absl::OkStatus();
@@ -622,7 +623,7 @@ absl::StatusOr<ModuleHandle> CudaExecutor::LoadModuleFromCuBin(
std::tie(module, module_refcount) = gpu_binary_to_module_[module_handle];

if (module == nullptr) {
TF_ASSIGN_OR_RETURN(module, LoadCubin(gpu_context(), cubin));
TF_ASSIGN_OR_RETURN(module, LoadCubin(cuda_context_, cubin));
module_refcount = 1;
VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
<< " as module " << module;
@@ -642,7 +643,7 @@ absl::StatusOr<ModuleHandle> CudaExecutor::LoadModuleFromPtx(const char* ptx) {
std::tie(module, module_refcount) = gpu_binary_to_module_[module_handle];

if (module == nullptr) {
TF_ASSIGN_OR_RETURN(module, LoadPtx(gpu_context(), ptx));
TF_ASSIGN_OR_RETURN(module, LoadPtx(cuda_context_, ptx));
VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
<< module;
module_refcount = 1;
@@ -672,7 +673,7 @@ absl::StatusOr<std::unique_ptr<Kernel>> CudaExecutor::LoadKernel(
VLOG(2) << "getting function " << *kernel_name << " from module " << module;
TF_ASSIGN_OR_RETURN(
CUfunction function,
GetModuleFunction(gpu_context(), module, kernel_name->c_str()));
GetModuleFunction(cuda_context_, module, kernel_name->c_str()));
cuda_kernel->set_gpu_function(function);

} else if (spec.has_cuda_ptx_in_memory()) {
@@ -698,7 +699,7 @@ absl::StatusOr<std::unique_ptr<Kernel>> CudaExecutor::LoadKernel(
VLOG(2) << "getting function " << *kernel_name << " from module " << module;
TF_ASSIGN_OR_RETURN(
CUfunction function,
GetModuleFunction(gpu_context(), module, kernel_name->c_str()));
GetModuleFunction(cuda_context_, module, kernel_name->c_str()));
cuda_kernel->set_gpu_function(function);

} else if (spec.has_in_process_symbol()) {
@@ -756,7 +757,7 @@ bool CudaExecutor::UnloadGpuBinary(ModuleHandle gpu_binary) {
VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
if (--refcount == 0) {
VLOG(3) << "Unloading CUDA module " << module;
UnloadCudaModule(gpu_context(), module);
UnloadCudaModule(cuda_context_, module);
gpu_binary_to_module_.erase(module_it);
}
return true;
@@ -782,7 +783,7 @@ void CudaExecutor::UnloadKernel(const Kernel* kernel) {

absl::StatusOr<ModuleHandle> CudaExecutor::LoadModule(
const MultiModuleLoaderSpec& spec) {
// In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
// We store the pointer to the GPU binary (PTX or CUBIN) as
// ModuleHandle::id().
if (spec.has_cuda_cubin_in_memory()) {
absl::MutexLock lock{&in_memory_modules_mu_};
@@ -905,15 +906,15 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
return DeviceMemoryBase(nullptr, 0);
} else if (memory_space ==
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
return DeviceMemoryBase(HostAllocate(cuda_context_, size), size);
}
CHECK_EQ(memory_space, 0);
return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
return DeviceMemoryBase(DeviceAllocate(cuda_context_, size), size);
}

absl::StatusOr<std::unique_ptr<MemoryAllocation>>
CudaExecutor::HostMemoryAllocate(uint64_t size) {
auto* buffer = HostAllocate(gpu_context(), size);
auto* buffer = HostAllocate(cuda_context_, size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
@@ -929,25 +930,25 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
}
auto memory_space = status_or_memory_space.value();
if (memory_space == MemoryType::kHost) {
HostDeallocate(gpu_context(), mem->opaque());
HostDeallocate(cuda_context_, mem->opaque());
} else {
DeviceDeallocate(gpu_context(), mem->opaque());
DeviceDeallocate(cuda_context_, mem->opaque());
}
}

void CudaExecutor::HostMemoryDeallocate(void* location) {
return HostDeallocate(gpu_context(), location);
return HostDeallocate(cuda_context_, location);
}

bool CudaExecutor::SynchronizeAllActivity() {
return gpu_context()->Synchronize().ok();
return cuda_context_->Synchronize().ok();
}

bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) {
VLOG(1) << "Called StreamExecutor::HostMemoryRegister(data=" << location
<< ")";

ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
@@ -962,7 +963,7 @@ bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) {
bool CudaExecutor::HostMemoryUnregister(void* location) {
VLOG(1) << "Called StreamExecutor::HostUnregister(data=" << location << ")";

ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
@@ -974,7 +975,7 @@ bool CudaExecutor::HostMemoryUnregister(void* location) {

absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location,
uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
CUdeviceptr cuda_location = AsCudaDevicePtr(location);
if (reinterpret_cast<uintptr_t>(location->opaque()) % sizeof(uint32_t) == 0 &&
size % sizeof(uint32_t) == 0) {
@@ -989,7 +990,7 @@ absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location,
absl::Status CudaExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src,
uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemcpyHtoD(AsCudaDevicePtr(gpu_dst), host_src, size),
absl::StrFormat(
@@ -1003,7 +1004,7 @@ absl::Status CudaExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
absl::Status CudaExecutor::SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemcpyDtoH(host_dst, AsCudaDevicePtr(gpu_src), size),
absl::StrFormat("failed to synchronous memcpy from device to host "
@@ -1026,7 +1027,7 @@ void CudaExecutor::DeallocateStream(Stream* stream) {
}

absl::Status CudaExecutor::BlockHostUntilDone(Stream* stream) {
return GpuDriver::SynchronizeStream(gpu_context(), AsGpuStreamValue(stream));
return GpuDriver::SynchronizeStream(cuda_context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* CudaExecutor::AsBlas() {
@@ -1091,18 +1092,18 @@ fft::FftSupport* CudaExecutor::AsFft() {
}

bool CudaExecutor::CanEnablePeerAccessTo(StreamExecutor* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return CanEnablePeerAccess(gpu_context(), cuda_other->gpu_context());
CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
return CanEnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
}

absl::Status CudaExecutor::EnablePeerAccessTo(StreamExecutor* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return EnablePeerAccess(gpu_context(), cuda_other->gpu_context());
CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
return EnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
}

bool CudaExecutor::DeviceMemoryUsage(int64_t* free_out,
int64_t* total_out) const {
ScopedActivateContext activation(gpu_context());
ScopedActivateContext activation(cuda_context_);
size_t free = 0;
size_t total = 0;
auto status = cuda::ToStatus(cuMemGetInfo(&free, &total));
@@ -1130,7 +1131,7 @@ absl::StatusOr<DeviceMemoryBase> CudaExecutor::GetSymbol(
CUmodule gpu_module_handle = it->second.first;
CHECK(gpu_module_handle != nullptr);
TF_RETURN_IF_ERROR(
GetModuleSymbol(gpu_context(), gpu_module_handle, symbol_name.c_str(),
GetModuleSymbol(cuda_context_, gpu_module_handle, symbol_name.c_str(),
reinterpret_cast<CUdeviceptr*>(&mem), &bytes));
return DeviceMemoryBase(mem, bytes);
}
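Taken together, the cuda_executor.cc edits above are two mechanical substitutions: gpu_context() becomes the cached cuda_context_ wherever the context is only passed along, and methods that need an RAII activation now build it through Activate() instead of constructing ScopedActivateContext themselves. A minimal before/after sketch, condensed from the UnifiedMemoryDeallocate hunk (driver calls elided):

// Before: activate the context through the virtual accessor.
void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
  ScopedActivateContext activation(gpu_context());
  // ... cuMemFree and logging while the context is active ...
}

// After: Activate() hands back an RAII object built from the cached pointer.
void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
  std::unique_ptr<ActivateContext> activation = Activate();
  // ... cuMemFree and logging while the context is active ...
}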
4 changes: 4 additions & 0 deletions xla/stream_executor/cuda/cuda_executor.h
@@ -1,4 +1,5 @@
#include "xla/stream_executor/activate_context.h"
#include "xla/stream_executor/cuda/cuda_context.h"
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
@@ -207,6 +208,9 @@ class CudaExecutor : public GpuExecutor {
// Lookup map for alive streams, from raw stream pointers.
absl::flat_hash_map<void*, Stream*> alive_gpu_streams_
ABSL_GUARDED_BY(alive_gpu_streams_mu_);

// CudaContext for this device.
CudaContext* cuda_context_;
};

} // namespace stream_executor::gpu
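The header change above is the other half of the pattern: cuda_context_ is a plain pointer member set during Init(). It also explains the peer-access edits in the .cc diff, which now cast the other executor to CudaExecutor so that its cached pointer is reachable; condensed from that hunk:

bool CudaExecutor::CanEnablePeerAccessTo(StreamExecutor* other) {
  // Cast to the concrete type so the peer's cached CudaContext* can be used;
  // this previously cast to GpuExecutor and called gpu_context() on both sides.
  CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
  return CanEnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
}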
5 changes: 0 additions & 5 deletions xla/stream_executor/gpu/gpu_executor.h
@@ -17,7 +17,6 @@ limitations under the License.
#define XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <cstdint>
#include <memory>
#include <utility>
#include <variant>
#include <vector>
@@ -26,12 +25,8 @@ limitations under the License.
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/synchronization/mutex.h"
#include "xla/stream_executor/device_description.h"
#include "xla/stream_executor/event_based_timer.h"
#include "xla/stream_executor/gpu/context.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_common.h"