
Commit

Cache the actual concrete Context pointer in each GpuExecutor class, and stop calling gpu_context() where possible.

PiperOrigin-RevId: 686600677
klucke authored and Google-ML-Automation committed Oct 16, 2024
1 parent 398c632 commit dceda27
Showing 5 changed files with 77 additions and 77 deletions.
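The change follows one pattern throughout: each concrete executor creates its CudaContext once in Init(), stores it both in the GpuExecutor base (via set_context()) and in a new cuda_context_ member, and then uses the cached member instead of the virtual gpu_context() accessor. A condensed sketch of that pattern, assembled from the hunks below (class and method bodies trimmed to the relevant lines, not the full class definition):

// Sketch only -- condensed from the diff, not the complete CudaExecutor class.
class CudaExecutor : public GpuExecutor {
 public:
  absl::Status Init() {
    TF_ASSIGN_OR_RETURN(device_, GetDevice(device_ordinal()));
    TF_ASSIGN_OR_RETURN(CudaContext * context,
                        CudaContext::Create(device_ordinal(), device_));
    set_context(context);     // base class keeps the generic Context*
    cuda_context_ = context;  // cached concrete pointer used from here on
    return absl::OkStatus();
  }

  std::unique_ptr<ActivateContext> Activate() {
    // Previously std::make_unique<ScopedActivateContext>(gpu_context()).
    return std::make_unique<ScopedActivateContext>(cuda_context_);
  }

 private:
  CudaContext* cuda_context_;  // CudaContext for this device
};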
67 changes: 34 additions & 33 deletions xla/stream_executor/cuda/cuda_executor.cc
@@ -555,30 +555,30 @@ absl::StatusOr<DeviceMemoryBase> CudaExecutor::GetMemoryRange(
}

std::unique_ptr<ActivateContext> CudaExecutor::Activate() {
return std::make_unique<ScopedActivateContext>(gpu_context());
return std::make_unique<ScopedActivateContext>(cuda_context_);
}

CudaExecutor::~CudaExecutor() {
CHECK(kernel_to_gpu_binary_.empty()) << "GpuExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "GpuExecutor has loaded modules.";
CHECK(kernel_to_gpu_binary_.empty()) << "CudaExecutor has live kernels.";
CHECK(gpu_binary_to_module_.empty()) << "CudaExecutor has loaded modules.";
set_context(nullptr);
}

void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
CUdeviceptr pointer = absl::bit_cast<CUdeviceptr>(location);
auto status = cuda::ToStatus(cuMemFree(pointer));
if (!status.ok()) {
LOG(ERROR) << "failed to free unified memory at " << location
<< "; result: " << status;
} else {
VLOG(2) << "deallocated unified memory at " << location << " for context "
<< gpu_context();
<< cuda_context_;
}
}

void* CudaExecutor::UnifiedMemoryAllocate(uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
CUdeviceptr result = 0;
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status =
@@ -589,16 +589,17 @@ void* CudaExecutor::UnifiedMemoryAllocate(uint64_t size) {
return nullptr;
}
void* ptr = reinterpret_cast<void*>(result);
VLOG(2) << "allocated " << ptr << " for context " << gpu_context() << " of "
VLOG(2) << "allocated " << ptr << " for context " << cuda_context_ << " of "
<< size << " bytes in unified memory";
return ptr;
}

absl::Status CudaExecutor::Init() {
TF_ASSIGN_OR_RETURN(device_, GetDevice(device_ordinal()));
TF_ASSIGN_OR_RETURN(Context * context,
TF_ASSIGN_OR_RETURN(CudaContext * context,
CudaContext::Create(device_ordinal(), device_));
set_context(context);
cuda_context_ = context;
TF_RETURN_IF_ERROR(GetComputeCapability(&cc_major_, &cc_minor_, device_));
TF_ASSIGN_OR_RETURN(delay_kernels_supported_, DelayKernelIsSupported());
return absl::OkStatus();
@@ -622,7 +623,7 @@ absl::StatusOr<ModuleHandle> CudaExecutor::LoadModuleFromCuBin(
std::tie(module, module_refcount) = gpu_binary_to_module_[module_handle];

if (module == nullptr) {
TF_ASSIGN_OR_RETURN(module, LoadCubin(gpu_context(), cubin));
TF_ASSIGN_OR_RETURN(module, LoadCubin(cuda_context_, cubin));
module_refcount = 1;
VLOG(3) << "Loaded CUBIN " << static_cast<const void*>(cubin)
<< " as module " << module;
@@ -642,7 +643,7 @@ absl::StatusOr<ModuleHandle> CudaExecutor::LoadModuleFromPtx(const char* ptx) {
std::tie(module, module_refcount) = gpu_binary_to_module_[module_handle];

if (module == nullptr) {
TF_ASSIGN_OR_RETURN(module, LoadPtx(gpu_context(), ptx));
TF_ASSIGN_OR_RETURN(module, LoadPtx(cuda_context_, ptx));
VLOG(3) << "Loaded PTX " << static_cast<const void*>(ptx) << " as module "
<< module;
module_refcount = 1;
@@ -672,7 +673,7 @@ absl::StatusOr<std::unique_ptr<Kernel>> CudaExecutor::LoadKernel(
VLOG(2) << "getting function " << *kernel_name << " from module " << module;
TF_ASSIGN_OR_RETURN(
CUfunction function,
GetModuleFunction(gpu_context(), module, kernel_name->c_str()));
GetModuleFunction(cuda_context_, module, kernel_name->c_str()));
cuda_kernel->set_gpu_function(function);

} else if (spec.has_cuda_ptx_in_memory()) {
@@ -698,7 +699,7 @@ absl::StatusOr<std::unique_ptr<Kernel>> CudaExecutor::LoadKernel(
VLOG(2) << "getting function " << *kernel_name << " from module " << module;
TF_ASSIGN_OR_RETURN(
CUfunction function,
GetModuleFunction(gpu_context(), module, kernel_name->c_str()));
GetModuleFunction(cuda_context_, module, kernel_name->c_str()));
cuda_kernel->set_gpu_function(function);

} else if (spec.has_in_process_symbol()) {
@@ -756,7 +757,7 @@ bool CudaExecutor::UnloadGpuBinary(ModuleHandle gpu_binary) {
VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
if (--refcount == 0) {
VLOG(3) << "Unloading CUDA module " << module;
UnloadCudaModule(gpu_context(), module);
UnloadCudaModule(cuda_context_, module);
gpu_binary_to_module_.erase(module_it);
}
return true;
@@ -782,7 +783,7 @@ void CudaExecutor::UnloadKernel(const Kernel* kernel) {

absl::StatusOr<ModuleHandle> CudaExecutor::LoadModule(
const MultiModuleLoaderSpec& spec) {
// In GpuExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
// We store the pointer to the GPU binary (PTX or CUBIN) as
// ModuleHandle::id().
if (spec.has_cuda_cubin_in_memory()) {
absl::MutexLock lock{&in_memory_modules_mu_};
@@ -905,15 +906,15 @@ DeviceMemoryBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
return DeviceMemoryBase(nullptr, 0);
} else if (memory_space ==
static_cast<int64_t>(stream_executor::MemoryType::kHost)) {
return DeviceMemoryBase(HostAllocate(gpu_context(), size), size);
return DeviceMemoryBase(HostAllocate(cuda_context_, size), size);
}
CHECK_EQ(memory_space, 0);
return DeviceMemoryBase(DeviceAllocate(gpu_context(), size), size);
return DeviceMemoryBase(DeviceAllocate(cuda_context_, size), size);
}

absl::StatusOr<std::unique_ptr<MemoryAllocation>>
CudaExecutor::HostMemoryAllocate(uint64_t size) {
auto* buffer = HostAllocate(gpu_context(), size);
auto* buffer = HostAllocate(cuda_context_, size);
if (buffer == nullptr && size > 0) {
return absl::InternalError(
absl::StrFormat("Failed to allocate HostMemory of size %d", size));
@@ -929,25 +930,25 @@ void CudaExecutor::Deallocate(DeviceMemoryBase* mem) {
}
auto memory_space = status_or_memory_space.value();
if (memory_space == MemoryType::kHost) {
HostDeallocate(gpu_context(), mem->opaque());
HostDeallocate(cuda_context_, mem->opaque());
} else {
DeviceDeallocate(gpu_context(), mem->opaque());
DeviceDeallocate(cuda_context_, mem->opaque());
}
}

void CudaExecutor::HostMemoryDeallocate(void* location) {
return HostDeallocate(gpu_context(), location);
return HostDeallocate(cuda_context_, location);
}

bool CudaExecutor::SynchronizeAllActivity() {
return gpu_context()->Synchronize().ok();
return cuda_context_->Synchronize().ok();
}

bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) {
VLOG(1) << "Called StreamExecutor::HostMemoryRegister(data=" << location
<< ")";

ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
// "Portable" memory is visible to all CUDA contexts. Safe for our use model.
auto status = cuda::ToStatus(
cuMemHostRegister(location, size, CU_MEMHOSTREGISTER_PORTABLE));
@@ -962,7 +963,7 @@ bool CudaExecutor::HostMemoryRegister(void* location, uint64_t size) {
bool CudaExecutor::HostMemoryUnregister(void* location) {
VLOG(1) << "Called StreamExecutor::HostUnregister(data=" << location << ")";

ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
auto status = cuda::ToStatus(cuMemHostUnregister(location));
if (!status.ok()) {
LOG(ERROR) << "error unregistering host memory at " << location << ": "
@@ -974,7 +975,7 @@ bool CudaExecutor::HostMemoryUnregister(void* location) {

absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location,
uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
CUdeviceptr cuda_location = AsCudaDevicePtr(location);
if (reinterpret_cast<uintptr_t>(location->opaque()) % sizeof(uint32_t) == 0 &&
size % sizeof(uint32_t) == 0) {
@@ -989,7 +990,7 @@ absl::Status CudaExecutor::SynchronousMemZero(DeviceMemoryBase* location,
absl::Status CudaExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
const void* host_src,
uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemcpyHtoD(AsCudaDevicePtr(gpu_dst), host_src, size),
absl::StrFormat(
@@ -1003,7 +1004,7 @@ absl::Status CudaExecutor::SynchronousMemcpy(DeviceMemoryBase* gpu_dst,
absl::Status CudaExecutor::SynchronousMemcpy(void* host_dst,
const DeviceMemoryBase& gpu_src,
uint64_t size) {
ScopedActivateContext activation(gpu_context());
std::unique_ptr<ActivateContext> activation = Activate();
TF_RETURN_IF_ERROR(cuda::ToStatus(
cuMemcpyDtoH(host_dst, AsCudaDevicePtr(gpu_src), size),
absl::StrFormat("failed to synchronous memcpy from device to host "
@@ -1026,7 +1027,7 @@ void CudaExecutor::DeallocateStream(Stream* stream) {
}

absl::Status CudaExecutor::BlockHostUntilDone(Stream* stream) {
return GpuDriver::SynchronizeStream(gpu_context(), AsGpuStreamValue(stream));
return GpuDriver::SynchronizeStream(cuda_context_, AsGpuStreamValue(stream));
}

blas::BlasSupport* CudaExecutor::AsBlas() {
@@ -1091,18 +1092,18 @@ fft::FftSupport* CudaExecutor::AsFft() {
}

bool CudaExecutor::CanEnablePeerAccessTo(StreamExecutor* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return CanEnablePeerAccess(gpu_context(), cuda_other->gpu_context());
CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
return CanEnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
}

absl::Status CudaExecutor::EnablePeerAccessTo(StreamExecutor* other) {
GpuExecutor* cuda_other = static_cast<GpuExecutor*>(other);
return EnablePeerAccess(gpu_context(), cuda_other->gpu_context());
CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
return EnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
}

bool CudaExecutor::DeviceMemoryUsage(int64_t* free_out,
int64_t* total_out) const {
ScopedActivateContext activation(gpu_context());
ScopedActivateContext activation(cuda_context_);
size_t free = 0;
size_t total = 0;
auto status = cuda::ToStatus(cuMemGetInfo(&free, &total));
@@ -1130,7 +1131,7 @@ absl::StatusOr<DeviceMemoryBase> CudaExecutor::GetSymbol(
CUmodule gpu_module_handle = it->second.first;
CHECK(gpu_module_handle != nullptr);
TF_RETURN_IF_ERROR(
GetModuleSymbol(gpu_context(), gpu_module_handle, symbol_name.c_str(),
GetModuleSymbol(cuda_context_, gpu_module_handle, symbol_name.c_str(),
reinterpret_cast<CUdeviceptr*>(&mem), &bytes));
return DeviceMemoryBase(mem, bytes);
}
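Taken together, the cuda_executor.cc edits above are two mechanical substitutions: gpu_context() becomes the cached cuda_context_ wherever the context is only passed along, and methods that need an RAII activation now build it through Activate() instead of constructing ScopedActivateContext themselves. A minimal before/after sketch, condensed from the UnifiedMemoryDeallocate hunk (driver calls elided):

// Before: activate the context through the virtual accessor.
void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
  ScopedActivateContext activation(gpu_context());
  // ... cuMemFree and logging while the context is active ...
}

// After: Activate() hands back an RAII object built from the cached pointer.
void CudaExecutor::UnifiedMemoryDeallocate(void* location) {
  std::unique_ptr<ActivateContext> activation = Activate();
  // ... cuMemFree and logging while the context is active ...
}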
4 changes: 4 additions & 0 deletions xla/stream_executor/cuda/cuda_executor.h
@@ -1,4 +1,5 @@
#include "xla/stream_executor/activate_context.h"
#include "xla/stream_executor/cuda/cuda_context.h"
/* Copyright 2024 The OpenXLA Authors.
Licensed under the Apache License, Version 2.0 (the "License");
@@ -207,6 +208,9 @@ class CudaExecutor : public GpuExecutor {
// Lookup map for alive streams, from raw stream pointers.
absl::flat_hash_map<void*, Stream*> alive_gpu_streams_
ABSL_GUARDED_BY(alive_gpu_streams_mu_);

// CudaContext for this device.
CudaContext* cuda_context_;
};

} // namespace stream_executor::gpu
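The header change above is the other half of the pattern: cuda_context_ is a plain pointer member set during Init(). It also explains the peer-access edits in the .cc diff, which now cast the other executor to CudaExecutor so that its cached pointer is reachable; condensed from that hunk:

bool CudaExecutor::CanEnablePeerAccessTo(StreamExecutor* other) {
  // Cast to the concrete type so the peer's cached CudaContext* can be used;
  // this previously cast to GpuExecutor and called gpu_context() on both sides.
  CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
  return CanEnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
}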
5 changes: 0 additions & 5 deletions xla/stream_executor/gpu/gpu_executor.h
@@ -17,7 +17,6 @@ limitations under the License.
#define XLA_STREAM_EXECUTOR_GPU_GPU_EXECUTOR_H_

#include <cstdint>
#include <memory>
#include <utility>
#include <variant>
#include <vector>
@@ -26,12 +25,8 @@ limitations under the License.
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/synchronization/mutex.h"
#include "xla/stream_executor/device_description.h"
#include "xla/stream_executor/event_based_timer.h"
#include "xla/stream_executor/gpu/context.h"
#include "xla/stream_executor/host_memory_allocation.h"
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/stream_executor.h"
#include "xla/stream_executor/stream_executor_common.h"