From 2b060ca4406e8e87306ba24f05655073fc537dd3 Mon Sep 17 00:00:00 2001
From: Misha Chornyi <99709299+mc-nv@users.noreply.github.com>
Date: Thu, 30 Jan 2025 11:37:21 -0800
Subject: [PATCH] Revert "Enable Cuda in Graphics Implementation for TensorRT backend (#100)" (#105)

This reverts commit ab13c1073bc624b1dc1b42fcbdc771e73bdc83db.
---
 CMakeLists.txt        | 13 ------
 src/instance_state.cc | 25 ++++-------
 src/model_state.cc    | 49 ++++++++--------
 src/tensorrt.cc       |  8 ----
 src/tensorrt_model.cc | 23 -----------
 src/tensorrt_model.h  | 96 -------------------------------------------
 6 files changed, 25 insertions(+), 189 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a8fc9c..682b3fd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,8 +37,6 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat
 option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON)
 option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON)
 option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF)
-option(TRITON_ENABLE_CUDA_CTX_SHARING "Enable Cuda context sharing support in backend." OFF)
-
 set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.")
 set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. Multiple paths may be specified by separating them with a semicolon.")
 
@@ -234,17 +232,6 @@ target_link_libraries(
     CUDA::cudart
 )
 
-if(${TRITON_ENABLE_CUDA_CTX_SHARING})
-  target_compile_definitions(
-    triton-tensorrt-backend
-    PRIVATE TRITON_ENABLE_CUDA_CTX_SHARING
-  )
-  target_link_libraries(
-    triton-tensorrt-backend
-    PRIVATE
-    CUDA::cuda_driver
-  )
-endif()
 
 #
 # Install
diff --git a/src/instance_state.cc b/src/instance_state.cc
index 1e0517b..56208a1 100644
--- a/src/instance_state.cc
+++ b/src/instance_state.cc
@@ -257,9 +257,7 @@ ModelInstanceState::ModelInstanceState(
 
 ModelInstanceState::~ModelInstanceState()
 {
-  if (!model_state_->IsCudaContextSharingEnabled()) {
-    cudaSetDevice(DeviceId());
-  }
+  cudaSetDevice(DeviceId());
   for (auto& io_binding_infos : io_binding_infos_) {
     for (auto& io_binding_info : io_binding_infos) {
       if (!io_binding_info.IsDynamicShapeOutput() &&
@@ -426,9 +424,7 @@ ModelInstanceState::Run(
   payload_.reset(new Payload(next_set_, requests, request_count));
   SET_TIMESTAMP(payload_->compute_start_ns_);
 
-  if (!model_state_->IsCudaContextSharingEnabled()) {
-    cudaSetDevice(DeviceId());
-  }
+  cudaSetDevice(DeviceId());
 #ifdef TRITON_ENABLE_STATS
   {
     SET_TIMESTAMP(payload_->compute_start_ns_);
@@ -1555,16 +1551,13 @@ ModelInstanceState::EvaluateTensorRTContext(
 TRITONSERVER_Error*
 ModelInstanceState::InitStreamsAndEvents()
 {
-  if (!model_state_->IsCudaContextSharingEnabled()) {
-    // Set the device before preparing the context.
-    auto cuerr = cudaSetDevice(DeviceId());
-    if (cuerr != cudaSuccess) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to set device for ") + Name() + ": " +
-           cudaGetErrorString(cuerr))
-              .c_str());
-    }
+  // Set the device before preparing the context.
+  auto cuerr = cudaSetDevice(DeviceId());
+  if (cuerr != cudaSuccess) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") +
+                                      Name() + ": " + cudaGetErrorString(cuerr))
+                                         .c_str());
   }
 
   // Create CUDA streams associated with the instance
diff --git a/src/model_state.cc b/src/model_state.cc
index 9d547c1..6127989 100644
--- a/src/model_state.cc
+++ b/src/model_state.cc
@@ -175,9 +175,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
 ModelState::~ModelState()
 {
   for (auto& device_engine : device_engines_) {
-    if (!IsCudaContextSharingEnabled()) {
-      cudaSetDevice(device_engine.first.first);
-    }
+    cudaSetDevice(device_engine.first.first);
     auto& runtime = device_engine.second.first;
     auto& engine = device_engine.second.second;
     // Need to reset explicitly to ensure proper destruction order
@@ -211,16 +209,15 @@ ModelState::CreateEngine(
   // We share the engine (for models that don't have dynamic shapes) and
   // runtime across instances that have access to the same GPU/NVDLA.
   if (eit->second.second == nullptr) {
-    if (!IsCudaContextSharingEnabled()) {
-      auto cuerr = cudaSetDevice(gpu_device);
-      if (cuerr != cudaSuccess) {
-        return TRITONSERVER_ErrorNew(
-            TRITONSERVER_ERROR_INTERNAL,
-            (std::string("unable to set device for ") + Name() + ": " +
-             cudaGetErrorString(cuerr))
-                .c_str());
-      }
+    auto cuerr = cudaSetDevice(gpu_device);
+    if (cuerr != cudaSuccess) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL,
+          (std::string("unable to set device for ") + Name() + ": " +
+           cudaGetErrorString(cuerr))
+              .c_str());
     }
+
     const bool new_runtime = (eit->second.first == nullptr);
     RETURN_IF_ERROR(LoadPlan(
         model_path, dla_core_id, &eit->second.first, &eit->second.second,
@@ -324,18 +321,6 @@ ModelState::AutoCompleteConfig()
           " to auto-complete config for " + Name())
              .c_str()));
 
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-  // Return failure if Cuda context sharing is enabled and
-  // if it is a multi GPU setup
-  if (IsCudaContextSharingEnabled() && device_id != 0) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL,
-        (std::string(
-             "Cuda context sharing is not supported on multi-GPU system."))
-            .c_str());
-  }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-
   cuerr = cudaSetDevice(device_id);
   if (cuerr != cudaSuccess) {
     return TRITONSERVER_ErrorNew(
@@ -388,15 +373,13 @@ ModelState::AutoCompleteConfig()
 
   RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path));
 
-  if (!IsCudaContextSharingEnabled()) {
-    cuerr = cudaSetDevice(current_device);
-    if (cuerr != cudaSuccess) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to revert CUDA device to GPU ") +
-           std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
-              .c_str());
-    }
+  cuerr = cudaSetDevice(current_device);
+  if (cuerr != cudaSuccess) {
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INTERNAL,
+        (std::string("unable to revert CUDA device to GPU ") +
+         std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
+            .c_str());
   }
 
   if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
diff --git a/src/tensorrt.cc b/src/tensorrt.cc
index 747b867..2c2d2a4 100644
--- a/src/tensorrt.cc
+++ b/src/tensorrt.cc
@@ -318,7 +318,6 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
     DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get());
   }
 
-  ScopedRuntimeCudaContext cuda_scope(model_state);
 
   // With each instance we create a ModelInstanceState object and
   // associate it with the TRITONBACKEND_ModelInstance.
@@ -354,11 +353,6 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
   LOG_MESSAGE(
       TRITONSERVER_LOG_INFO,
       "TRITONBACKEND_ModelInstanceFinalize: delete instance state");
-  if (!instance_state) {
-    return nullptr;
-  }
-
-  ScopedRuntimeCudaContext cuda_scope(instance_state->StateForModel());
 
   delete instance_state;
 
@@ -383,8 +377,6 @@ TRITONBACKEND_ModelInstanceExecute(
       instance, reinterpret_cast<void**>(&instance_state)));
   ModelState* model_state = instance_state->StateForModel();
 
-  ScopedRuntimeCudaContext cuda_scope(model_state);
-
   // For TensorRT backend, the executing instance may not closely tie to
   // TRITONBACKEND_ModelInstance, the instance will be assigned based on
   // execution policy.
diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc
index b466bfa..bf2959d 100644
--- a/src/tensorrt_model.cc
+++ b/src/tensorrt_model.cc
@@ -90,14 +90,6 @@ TensorRTModel::ParseModelConfig()
     }
   }
 
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-  std::string ptr_str = "";
-  RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str));
-  cuda_ctx = static_cast<CUcontext>(StringToPointer(ptr_str));
-  // cuda_ctx = static_cast(reinterpret_cast(ptr_str));
-  LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set");
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-
   return nullptr;  // Success
 }
 
@@ -128,19 +120,4 @@ TensorRTModel::GetCudaStreamPriority()
   return cuda_stream_priority;
 }
 
-template <>
-TRITONSERVER_Error*
-TensorRTModel::GetParameter(
-    std::string const& name, std::string& str_value)
-{
-  triton::common::TritonJson::Value parameters;
-  RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", &parameters));
-
-  triton::common::TritonJson::Value value;
-  RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value));
-
-  value.MemberAsString("string_value", &str_value);
-  return nullptr;
-}
-
 }}}  // namespace triton::backend::tensorrt
diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h
index bbc2329..86c67a2 100644
--- a/src/tensorrt_model.h
+++ b/src/tensorrt_model.h
@@ -25,11 +25,6 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #pragma once
 
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-#include <cuda.h>
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-#include <sstream>
-
 #include "triton/backend/backend_model.h"
 
 namespace triton { namespace backend { namespace tensorrt {
@@ -39,14 +34,6 @@ class TensorRTModel : public BackendModel {
   TensorRTModel(TRITONBACKEND_Model* triton_model);
   virtual ~TensorRTModel() = default;
 
-  template <typename T>
-  TRITONSERVER_Error* GetParameter(std::string const& name, T& value)
-  {
-    assert(false);
-    auto dummy = T();
-    return dummy;
-  }
-
   TRITONSERVER_Error* SetTensorRTModelConfig();
 
   TRITONSERVER_Error* ParseModelConfig();
@@ -66,65 +53,6 @@ class TensorRTModel : public BackendModel {
   bool EagerBatching() { return eager_batching_; }
   bool BusyWaitEvents() { return busy_wait_events_; }
 
-  template <>
-  TRITONSERVER_Error* GetParameter(
-      std::string const& name, std::string& str_value);
-
-  void* StringToPointer(std::string& str)
-  {
-    std::stringstream ss;
-    ss << str;
-
-    void* ctx_ptr;
-    ss >> ctx_ptr;
-    return ctx_ptr;
-  }
-
-  //! Following functions are related to custom Cuda context (Cuda in Graphics)
-  //! sharing for gaming use case. Creating a shared contexts reduces context
-  //! switching overhead and leads to better performance of model execution
-  //! along side Graphics workload.
-
-  bool IsCudaContextSharingEnabled()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    return cuda_ctx != nullptr;
-#else
-    return false;
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-  }
-
-  inline TRITONSERVER_Error* PushCudaContext()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    if (CUDA_SUCCESS != cuCtxPushCurrent(cuda_ctx)) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to push Cuda context for ") + Name()).c_str());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-    return nullptr;
-  }
-
-  inline TRITONSERVER_Error* PopCudaContext()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    CUcontext oldCtx{};
-    if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("unable to pop Cuda context for ") + Name()).c_str());
-    }
-    if (oldCtx != cuda_ctx) {
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INTERNAL,
-          (std::string("popping the wrong Cuda context for ") + Name())
-              .c_str());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-    return nullptr;
-  }
-
  protected:
   common::TritonJson::Value graph_specs_;
   Priority priority_;
@@ -133,30 +61,6 @@ class TensorRTModel : public BackendModel {
   bool separate_output_stream_;
   bool eager_batching_;
   bool busy_wait_events_;
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-  CUcontext cuda_ctx = nullptr;
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-};
-
-struct ScopedRuntimeCudaContext {
-  ScopedRuntimeCudaContext(TensorRTModel* model_state)
-      : model_state_(model_state)
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    if (model_state_->IsCudaContextSharingEnabled()) {
-      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-  }
-  ~ScopedRuntimeCudaContext()
-  {
-#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
-    if (model_state_->IsCudaContextSharingEnabled()) {
-      THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext());
-    }
-#endif  // TRITON_ENABLE_CUDA_CTX_SHARING
-  }
-  TensorRTModel* model_state_;
-};
+};
 
 }}}  // namespace triton::backend::tensorrt
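
For reference, the reverted feature routed backend CUDA work through an externally supplied context: the graphics application passed a CUcontext handle in the CUDA_CONTEXT_PTR model parameter, and the instance initialize/finalize/execute entry points were wrapped in the ScopedRuntimeCudaContext guard removed above, which invoked cuCtxPushCurrent() on entry and cuCtxPopCurrent() on exit. The sketch below is a minimal, self-contained illustration of that driver-API push/pop pattern; it is not the backend code, and the ContextGuard name and the locally created context are assumptions made for the example.

// Minimal sketch of the RAII push/pop pattern used by the removed
// ScopedRuntimeCudaContext. Links against the CUDA driver API (-lcuda).
#include <cuda.h>

#include <cstdio>
#include <stdexcept>

// Push a caller-owned CUcontext for the lifetime of this object and pop it on
// destruction, warning if a different context was popped.
class ContextGuard {
 public:
  explicit ContextGuard(CUcontext ctx) : ctx_(ctx)
  {
    if (cuCtxPushCurrent(ctx_) != CUDA_SUCCESS) {
      throw std::runtime_error("unable to push Cuda context");
    }
  }
  ~ContextGuard()
  {
    CUcontext old_ctx{};
    if (cuCtxPopCurrent(&old_ctx) != CUDA_SUCCESS || old_ctx != ctx_) {
      std::fprintf(stderr, "popping the wrong Cuda context\n");
    }
  }

 private:
  CUcontext ctx_;
};

int main()
{
  // In the reverted feature the context came from the graphics application;
  // here one is created locally just to exercise the guard.
  CUdevice device = 0;
  CUcontext shared_ctx = nullptr;
  if (cuInit(0) != CUDA_SUCCESS || cuDeviceGet(&device, 0) != CUDA_SUCCESS ||
      cuCtxCreate(&shared_ctx, 0, device) != CUDA_SUCCESS) {
    std::fprintf(stderr, "CUDA driver API initialization failed\n");
    return 1;
  }
  {
    ContextGuard guard(shared_ctx);
    // CUDA work issued here runs in the shared context, avoiding a context
    // switch away from a concurrently running graphics workload.
  }
  cuCtxDestroy(shared_ctx);
  return 0;
}

The RAII form mirrors the intent of the removed guard: the pop happens on every exit path from the scope, so an early return cannot leave the shared context stuck on the calling thread's context stack.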