Revert "Enable Cuda in Graphics Implementation for TensorRT backend (#…
Browse files Browse the repository at this point in the history
…100)" (#105)

This reverts commit ab13c10.
mc-nv authored Jan 30, 2025
1 parent ab13c10 commit 2b060ca
Showing 6 changed files with 25 additions and 189 deletions.
13 changes: 0 additions & 13 deletions CMakeLists.txt
@@ -37,8 +37,6 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat
 option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON)
 option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON)
 option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF)
-option(TRITON_ENABLE_CUDA_CTX_SHARING "Enable Cuda context sharing support in backend." OFF)
-
 set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.")
 set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. Multiple paths may be specified by separating them with a semicolon.")
 
@@ -234,17 +232,6 @@ target_link_libraries(
   CUDA::cudart
 )
 
-if(${TRITON_ENABLE_CUDA_CTX_SHARING})
-  target_compile_definitions(
-    triton-tensorrt-backend
-    PRIVATE TRITON_ENABLE_CUDA_CTX_SHARING
-  )
-  target_link_libraries(
-    triton-tensorrt-backend
-    PRIVATE
-    CUDA::cuda_driver
-  )
-endif()
 
 #
 # Install
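
Note on the deleted build option: TRITON_ENABLE_CUDA_CTX_SHARING both defined a compile-time macro and linked the CUDA driver library (CUDA::cuda_driver) into the backend target, so every context-sharing path compiles out when the option is OFF. A minimal sketch of how such a definition is consumed on the C++ side (illustrative only; the real gating sites are the #ifdef blocks in the source diffs below):

```cpp
// Illustrative consumer of the CMake-provided definition; hypothetical
// helper, not part of the backend.
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
#include <cuda.h>  // driver API; this is why CUDA::cuda_driver was linked
#endif

bool
CudaCtxSharingCompiledIn()
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
  return true;   // driver-API context-sharing paths are built in
#else
  return false;  // only the CUDA runtime API (CUDA::cudart) is used
#endif
}
```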
25 changes: 9 additions & 16 deletions src/instance_state.cc
@@ -257,9 +257,7 @@ ModelInstanceState::ModelInstanceState(
 
 ModelInstanceState::~ModelInstanceState()
 {
-  if (!model_state_->IsCudaContextSharingEnabled()) {
-    cudaSetDevice(DeviceId());
-  }
+  cudaSetDevice(DeviceId());
   for (auto& io_binding_infos : io_binding_infos_) {
     for (auto& io_binding_info : io_binding_infos) {
       if (!io_binding_info.IsDynamicShapeOutput() &&
@@ -426,9 +424,7 @@ ModelInstanceState::Run(
   payload_.reset(new Payload(next_set_, requests, request_count));
   SET_TIMESTAMP(payload_->compute_start_ns_);
 
-  if (!model_state_->IsCudaContextSharingEnabled()) {
-    cudaSetDevice(DeviceId());
-  }
+  cudaSetDevice(DeviceId());
 #ifdef TRITON_ENABLE_STATS
   {
     SET_TIMESTAMP(payload_->compute_start_ns_);
@@ -1555,16 +1551,13 @@ ModelInstanceState::EvaluateTensorRTContext(
 TRITONSERVER_Error*
 ModelInstanceState::InitStreamsAndEvents()
 {
-  if (!model_state_->IsCudaContextSharingEnabled()) {
-    // Set the device before preparing the context.
-    auto cuerr = cudaSetDevice(DeviceId());
-    if (cuerr != cudaSuccess) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to set device for ") + Name() + ": " +
-           cudaGetErrorString(cuerr))
-              .c_str());
-    }
+  // Set the device before preparing the context.
+  auto cuerr = cudaSetDevice(DeviceId());
+  if (cuerr != cudaSuccess) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") +
+                                      Name() + ": " + cudaGetErrorString(cuerr))
+            .c_str());
   }
 
   // Create CUDA streams associated with the instance
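
The three hunks above share one pattern: the reverted feature called cudaSetDevice() only when no application-owned context was being shared. A simplified sketch of that guard; the function name is hypothetical, and the rationale is inferred from the reverted code rather than stated in this commit:

```cpp
#include <cuda_runtime.h>

// Sketch of the guard this revert removes: with a shared CUcontext
// current on the thread, the backend skipped cudaSetDevice() so that
// subsequent runtime calls keep targeting the shared context instead
// of rebinding the thread to the device's primary context.
void
BindDeviceUnlessSharing(bool cuda_ctx_sharing_enabled, int device_id)
{
  if (!cuda_ctx_sharing_enabled) {
    cudaSetDevice(device_id);  // backend owns device/context selection
  }
  // else: the pushed shared context already implies the device
}
```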
49 changes: 16 additions & 33 deletions src/model_state.cc
@@ -175,9 +175,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
 ModelState::~ModelState()
 {
   for (auto& device_engine : device_engines_) {
-    if (!IsCudaContextSharingEnabled()) {
-      cudaSetDevice(device_engine.first.first);
-    }
+    cudaSetDevice(device_engine.first.first);
     auto& runtime = device_engine.second.first;
     auto& engine = device_engine.second.second;
     // Need to reset explicitly to ensure proper destruction order
@@ -211,16 +209,15 @@ ModelState::CreateEngine(
   // We share the engine (for models that don't have dynamic shapes) and
   // runtime across instances that have access to the same GPU/NVDLA.
   if (eit->second.second == nullptr) {
-    if (!IsCudaContextSharingEnabled()) {
-      auto cuerr = cudaSetDevice(gpu_device);
-      if (cuerr != cudaSuccess) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INTERNAL,
-            (std::string("unable to set device for ") + Name() + ": " +
-             cudaGetErrorString(cuerr))
-                .c_str());
-      }
+    auto cuerr = cudaSetDevice(gpu_device);
+    if (cuerr != cudaSuccess) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to set device for ") + Name() + ": " +
+           cudaGetErrorString(cuerr))
+              .c_str());
     }
+
     const bool new_runtime = (eit->second.first == nullptr);
     RETURN_IF_ERROR(LoadPlan(
         model_path, dla_core_id, &eit->second.first, &eit->second.second,
@@ -324,18 +321,6 @@ ModelState::AutoCompleteConfig()
           " to auto-complete config for " + Name())
              .c_str()));
 
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-  // Return failure if Cuda context sharing is enabled and
-  // if it is a multi GPU setup
-  if (IsCudaContextSharingEnabled() && device_id != 0) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL,
-        (std::string(
-             "Cuda context sharing is not supported on multi-GPU system."))
-            .c_str());
-  }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-
   cuerr = cudaSetDevice(device_id);
   if (cuerr != cudaSuccess) {
     return TRITONSERVER_ErrorNew(
@@ -388,15 +373,13 @@
 
   RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path));
 
-  if (!IsCudaContextSharingEnabled()) {
-    cuerr = cudaSetDevice(current_device);
-    if (cuerr != cudaSuccess) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to revert CUDA device to GPU ") +
-           std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
-              .c_str());
-    }
+  cuerr = cudaSetDevice(current_device);
+  if (cuerr != cudaSuccess) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INTERNAL,
+        (std::string("unable to revert CUDA device to GPU ") +
+         std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
+            .c_str());
   }
 
   if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
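
The AutoCompleteConfig() hunks restore the plain save/switch/restore device pattern. A self-contained sketch under stated assumptions (device 0 is introspected, and exit codes stand in for TRITONSERVER_Error* returns):

```cpp
#include <cuda_runtime.h>

// Save the caller's device, switch to the device being introspected,
// then restore -- the pattern AutoCompleteConfig() returns to here.
int
main()
{
  int current_device = 0;
  if (cudaGetDevice(&current_device) != cudaSuccess) {
    return 1;
  }

  const int device_id = 0;  // assumption: introspect device 0
  if (cudaSetDevice(device_id) != cudaSuccess) {
    return 1;
  }
  // ... query device properties / auto-complete the model config ...

  // Unconditionally restore the caller's device; the reverted code
  // skipped this step when a shared context was in use.
  if (cudaSetDevice(current_device) != cudaSuccess) {
    return 1;
  }
  return 0;
}
```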
8 changes: 0 additions & 8 deletions src/tensorrt.cc
@@ -318,7 +318,6 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
     DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get());
   }
 
-  ScopedRuntimeCudaContext cuda_scope(model_state);
 
   // With each instance we create a ModelInstanceState object and
   // associate it with the TRITONBACKEND_ModelInstance.
@@ -354,11 +353,6 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
   LOG_MESSAGE(
       TRITONSERVER_LOG_INFO,
       "TRITONBACKEND_ModelInstanceFinalize: delete instance state");
-  if (!instance_state) {
-    return nullptr;
-  }
-
-  ScopedRuntimeCudaContext cuda_scope(instance_state->StateForModel());
 
   delete instance_state;
 
Expand All @@ -383,8 +377,6 @@ TRITONBACKEND_ModelInstanceExecute(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();

ScopedRuntimeCudaContext cuda_scope(model_state);

// For TensorRT backend, the executing instance may not closely tie to
// TRITONBACKEND_ModelInstance, the instance will be assigned based on
// execution policy.
Expand Down
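
Each removed cuda_scope declaration instantiated an RAII guard at a backend entry point, so the shared context was pushed on entry and popped on every return path. A self-contained stand-in for the idea (ScopedCudaContext is a hypothetical simplification; the backend's actual ScopedRuntimeCudaContext appears in the tensorrt_model.h diff below):

```cpp
#include <cuda.h>

#include <stdexcept>

// RAII guard: push a shared CUcontext on construction, pop it on
// destruction, so early returns and exceptions still restore the
// caller's context. A nullptr context makes both operations no-ops.
struct ScopedCudaContext {
  explicit ScopedCudaContext(CUcontext ctx) : ctx_(ctx)
  {
    if (ctx_ != nullptr && cuCtxPushCurrent(ctx_) != CUDA_SUCCESS) {
      throw std::runtime_error("unable to push CUDA context");
    }
  }
  ~ScopedCudaContext()
  {
    if (ctx_ != nullptr) {
      CUcontext old{};
      cuCtxPopCurrent(&old);  // best effort; destructors must not throw
    }
  }
  CUcontext ctx_;
};
```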
23 changes: 0 additions & 23 deletions src/tensorrt_model.cc
@@ -90,14 +90,6 @@ TensorRTModel::ParseModelConfig()
     }
   }
 
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-  std::string ptr_str = "";
-  RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str));
-  cuda_ctx = static_cast<CUcontext>(StringToPointer(ptr_str));
-  // cuda_ctx = static_cast<CUcontext>(reinterpret_cast<void*>(ptr_str));
-  LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set");
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-
   return nullptr;  // Success
 }
 
@@ -128,19 +120,4 @@ TensorRTModel::GetCudaStreamPriority()
   return cuda_stream_priority;
 }
 
-template <>
-TRITONSERVER_Error*
-TensorRTModel::GetParameter<std::string>(
-    std::string const& name, std::string& str_value)
-{
-  triton::common::TritonJson::Value parameters;
-  RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", &parameters));
-
-  triton::common::TritonJson::Value value;
-  RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value));
-
-  value.MemberAsString("string_value", &str_value);
-  return nullptr;
-}
-
 }}}  // namespace triton::backend::tensorrt
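
The deleted specialization read the model-config entry parameters { "CUDA_CONTEXT_PTR": { string_value: "..." } }, and StringToPointer (removed from the header below) converted that string back into a context handle. A self-contained sketch of the same pointer round-trip via iostreams, minus the Triton JSON plumbing:

```cpp
#include <cassert>
#include <sstream>
#include <string>

// Round-trip a pointer through its textual form, mirroring the removed
// StringToPointer helper: operator<< prints a pointer and operator>>
// parses the same format back into a void*.
int
main()
{
  int x = 42;
  void* original = &x;

  std::ostringstream out;
  out << original;  // e.g. "0x7ffc9d8f5a4c"
  const std::string ptr_str = out.str();

  std::istringstream in(ptr_str);
  void* parsed = nullptr;
  in >> parsed;

  assert(parsed == original);
  return 0;
}
```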
96 changes: 0 additions & 96 deletions src/tensorrt_model.h
@@ -25,11 +25,6 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #pragma once
 
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-#include <cuda.h>
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-#include <sstream>
-
 #include "triton/backend/backend_model.h"
 
 namespace triton { namespace backend { namespace tensorrt {
@@ -39,14 +34,6 @@ class TensorRTModel : public BackendModel {
   TensorRTModel(TRITONBACKEND_Model* triton_model);
   virtual ~TensorRTModel() = default;
 
-  template <typename T>
-  TRITONSERVER_Error* GetParameter(std::string const& name, T& value)
-  {
-    assert(false);
-    auto dummy = T();
-    return dummy;
-  }
-
   TRITONSERVER_Error* SetTensorRTModelConfig();
 
   TRITONSERVER_Error* ParseModelConfig();
@@ -66,65 +53,6 @@
   bool EagerBatching() { return eager_batching_; }
   bool BusyWaitEvents() { return busy_wait_events_; }
 
-  template <>
-  TRITONSERVER_Error* GetParameter<std::string>(
-      std::string const& name, std::string& str_value);
-
-  void* StringToPointer(std::string& str)
-  {
-    std::stringstream ss;
-    ss << str;
-
-    void* ctx_ptr;
-    ss >> ctx_ptr;
-    return ctx_ptr;
-  }
-
-  //! Following functions are related to custom Cuda context (Cuda in Graphics)
-  //! sharing for gaming use case. Creating a shared contexts reduces context
-  //! switching overhead and leads to better performance of model execution
-  //! along side Graphics workload.
-
-  bool IsCudaContextSharingEnabled()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    return cuda_ctx != nullptr;
-#else
-    return false;
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-  }
-
-  inline TRITONSERVER_Error* PushCudaContext()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    if (CUDA_SUCCESS != cuCtxPushCurrent(cuda_ctx)) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to push Cuda context for ") + Name()).c_str());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-    return nullptr;
-  }
-
-  inline TRITONSERVER_Error* PopCudaContext()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    CUcontext oldCtx{};
-    if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to pop Cuda context for ") + Name()).c_str());
-    }
-    if (oldCtx != cuda_ctx) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("popping the wrong Cuda context for ") + Name())
-              .c_str());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-    return nullptr;
-  }
-
 protected:
   common::TritonJson::Value graph_specs_;
   Priority priority_;
@@ -133,30 +61,6 @@
   bool separate_output_stream_;
   bool eager_batching_;
   bool busy_wait_events_;
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-  CUcontext cuda_ctx = nullptr;
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
 };
 
-struct ScopedRuntimeCudaContext {
-  ScopedRuntimeCudaContext(TensorRTModel* model_state)
-      : model_state_(model_state)
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    if (model_state_->IsCudaContextSharingEnabled()) {
-      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-  }
-  ~ScopedRuntimeCudaContext()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    if (model_state_->IsCudaContextSharingEnabled()) {
-      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-  }
-  TensorRTModel* model_state_;
-};
-
 }}}  // namespace triton::backend::tensorrt
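
Taken together, the removed header code implemented the whole sharing flow: the application serializes its CUcontext into the CUDA_CONTEXT_PTR parameter, and the backend parses it, stores it in cuda_ctx, and pushes/pops it around GPU work. A self-contained, hypothetical demo of that flow using only the CUDA driver API (not Triton code; most error checks omitted for brevity):

```cpp
#include <cuda.h>

#include <sstream>
#include <string>

int
main()
{
  // "Application" side: create a context and serialize its address,
  // as a client would when setting the CUDA_CONTEXT_PTR parameter.
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext app_ctx;
  cuCtxCreate(&app_ctx, 0, dev);

  std::ostringstream out;
  out << static_cast<void*>(app_ctx);
  const std::string param = out.str();

  // "Backend" side: parse the parameter and adopt the shared context.
  std::istringstream in(param);
  void* raw = nullptr;
  in >> raw;
  CUcontext shared = static_cast<CUcontext>(raw);

  cuCtxPushCurrent(shared);  // GPU work now targets the app's context
  // ... TensorRT engine execution would happen here ...
  CUcontext popped;
  cuCtxPopCurrent(&popped);  // popped == shared

  cuCtxDestroy(app_ctx);
  return 0;
}
```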
