From 134bf3396c6542b88bc1ef1b62469a9e2f012d82 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 28 Aug 2024 08:27:35 +0000 Subject: [PATCH 01/14] Enable CiG support in Tensorrt backend --- CMakeLists.txt | 1 + src/instance_state.cc | 28 ++++++++++++++------- src/model_state.cc | 57 ++++++++++++++++++++++++++----------------- src/tensorrt.cc | 35 ++++++++++++++++++++++++++ src/tensorrt_model.cc | 18 ++++++++++++-- src/tensorrt_model.h | 5 ++++ 6 files changed, 110 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a70a85..b798d11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,6 +269,7 @@ target_link_libraries( triton-tensorrt-backend PRIVATE CUDA::cudart + CUDA::cuda_driver ) diff --git a/src/instance_state.cc b/src/instance_state.cc index 56208a1..e7113d9 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,7 +257,10 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { - cudaSetDevice(DeviceId()); + // Set device if CiG is disabled + if (!model_state_->isCiGEnabled()) { + cudaSetDevice(DeviceId()); + } for (auto& io_binding_infos : io_binding_infos_) { for (auto& io_binding_info : io_binding_infos) { if (!io_binding_info.IsDynamicShapeOutput() && @@ -424,7 +427,10 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); - cudaSetDevice(DeviceId()); + // Set device if CiG is disabled + if (!model_state_->isCiGEnabled()) { + cudaSetDevice(DeviceId()); + } #ifdef TRITON_ENABLE_STATS { SET_TIMESTAMP(payload_->compute_start_ns_); @@ -1551,13 +1557,17 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* ModelInstanceState::InitStreamsAndEvents() { - // Set the device before preparing the context. 
- auto cuerr = cudaSetDevice(DeviceId()); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") + - Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!model_state_->isCiGEnabled()) { + // Set the device before preparing the context. + auto cuerr = cudaSetDevice(DeviceId()); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set device for ") + Name() + ": " + + cudaGetErrorString(cuerr)) + .c_str()); + } } // Create CUDA streams associated with the instance diff --git a/src/model_state.cc b/src/model_state.cc index 6127989..0622a94 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,7 +175,10 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { - cudaSetDevice(device_engine.first.first); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + cudaSetDevice(device_engine.first.first); + } auto& runtime = device_engine.second.first; auto& engine = device_engine.second.second; // Need to reset explicitly to ensure proper destruction order @@ -209,15 +212,17 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. 
if (eit->second.second == nullptr) { - auto cuerr = cudaSetDevice(gpu_device); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to set device for ") + Name() + ": " + - cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + auto cuerr = cudaSetDevice(gpu_device); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set device for ") + Name() + ": " + + cudaGetErrorString(cuerr)) + .c_str()); + } } - const bool new_runtime = (eit->second.first == nullptr); RETURN_IF_ERROR(LoadPlan( model_path, dla_core_id, &eit->second.first, &eit->second.second, @@ -321,13 +326,16 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); - cuerr = cudaSetDevice(device_id); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to set CUDA device to GPU ") + - std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + cuerr = cudaSetDevice(device_id); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set CUDA device to GPU ") + + std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) + .c_str()); + } } std::string artifact_name; @@ -373,13 +381,16 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); - cuerr = cudaSetDevice(current_device); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to revert CUDA device to GPU ") + - std::to_string(current_device) + " : " + cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + cuerr = cudaSetDevice(current_device); + if (cuerr != cudaSuccess) { + return 
TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to revert CUDA device to GPU ") + + std::to_string(current_device) + " : " + cudaGetErrorString(cuerr)) + .c_str()); + } } if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 2c2d2a4..1517035 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,6 +318,13 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } + CUcontext cig_ctx = model_state->GetCiGContext(); + if (cig_ctx != nullptr) { + auto result = cuCtxPushCurrent(cig_ctx); + RETURN_ERROR_IF_FALSE( + result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, + std::string("Error while setting cig context")); + } // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. @@ -336,6 +343,10 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) instance, ba_array, ba_len)); } + if (cig_ctx != nullptr) { + cuCtxPopCurrent(&cig_ctx); + } + return nullptr; // success } @@ -350,12 +361,24 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) ModelInstanceState* instance_state = reinterpret_cast(vstate); + CUcontext cig_ctx = instance_state->StateForModel()->GetCiGContext(); + if (cig_ctx != nullptr) { + auto result = cuCtxPushCurrent(cig_ctx); + RETURN_ERROR_IF_FALSE( + result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, + std::string("Error while setting cig context")); + } + LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); delete instance_state; + if (cig_ctx != nullptr) { + cuCtxPopCurrent(&cig_ctx); + } + return nullptr; // success } @@ -377,6 +400,14 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); + CUcontext cig_ctx = 
instance_state->StateForModel()->GetCiGContext(); + if (cig_ctx != nullptr) { + auto result = cuCtxPushCurrent(cig_ctx); + RETURN_ERROR_IF_FALSE( + result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, + std::string("Error while setting cig context")); + } + // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on // execution policy. @@ -411,6 +442,10 @@ TRITONBACKEND_ModelInstanceExecute( // arrive between when the batch is formed and when batch is executed. semaphore->Acquire(); + if (cig_ctx != nullptr) { + cuCtxPopCurrent(&cig_ctx); + } + return nullptr; // success } diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index bf2959d..7edcec6 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "tensorrt_model.h" +#include namespace triton { namespace backend { namespace tensorrt { @@ -53,7 +54,7 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), priority_(Priority::DEFAULT), use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), separate_output_stream_(false), eager_batching_(false), - busy_wait_events_(false) + busy_wait_events_(false), cig_ctx_(nullptr) { ParseModelConfig(); } @@ -89,7 +90,20 @@ TensorRTModel::ParseModelConfig() cuda.MemberAsBool("output_copy_stream", &separate_output_stream_)); } } - + triton::common::TritonJson::Value parameters; + if (model_config_.Find("parameters", ¶meters)) { + triton::common::TritonJson::Value value; + std::string ptr_value; + if (parameters.Find("CIG_CONTEXT_PTR", &value)) { + RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value)); + std::stringstream ss; + ss << ptr_value; + void* ctx_ptr; + ss >> ctx_ptr; + cig_ctx_ = static_cast(ctx_ptr); + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set"); + } + } return nullptr; // Success } diff --git 
a/src/tensorrt_model.h b/src/tensorrt_model.h index 86c67a2..0da6682 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -26,6 +26,7 @@ #pragma once #include "triton/backend/backend_model.h" +#include namespace triton { namespace backend { namespace tensorrt { @@ -53,6 +54,9 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } + CUcontext GetCiGContext() { return cig_ctx_; } + bool isCiGEnabled() { return cig_ctx_ != nullptr; } + protected: common::TritonJson::Value graph_specs_; Priority priority_; @@ -61,6 +65,7 @@ class TensorRTModel : public BackendModel { bool separate_output_stream_; bool eager_batching_; bool busy_wait_events_; + CUcontext cig_ctx_; }; }}} // namespace triton::backend::tensorrt From 7353671a81f602607a3fbd5d7c1d31ea1c9e7aaf Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 28 Aug 2024 09:43:51 +0000 Subject: [PATCH 02/14] Creating scoped runtime context structure for better management of CiG context --- src/tensorrt.cc | 38 ++++-------------------------------- src/tensorrt_model.h | 46 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 34 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 1517035..fe2ad88 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,13 +318,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } - CUcontext cig_ctx = model_state->GetCiGContext(); - if (cig_ctx != nullptr) { - auto result = cuCtxPushCurrent(cig_ctx); - RETURN_ERROR_IF_FALSE( - result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, - std::string("Error while setting cig context")); - } + ScopedRuntimeCiGContext cig_scope(model_state); // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. 
@@ -343,10 +337,6 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) instance, ba_array, ba_len)); } - if (cig_ctx != nullptr) { - cuCtxPopCurrent(&cig_ctx); - } - return nullptr; // success } @@ -361,24 +351,14 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) ModelInstanceState* instance_state = reinterpret_cast(vstate); - CUcontext cig_ctx = instance_state->StateForModel()->GetCiGContext(); - if (cig_ctx != nullptr) { - auto result = cuCtxPushCurrent(cig_ctx); - RETURN_ERROR_IF_FALSE( - result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, - std::string("Error while setting cig context")); - } - + ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); + LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); delete instance_state; - if (cig_ctx != nullptr) { - cuCtxPopCurrent(&cig_ctx); - } - return nullptr; // success } @@ -400,13 +380,7 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); - CUcontext cig_ctx = instance_state->StateForModel()->GetCiGContext(); - if (cig_ctx != nullptr) { - auto result = cuCtxPushCurrent(cig_ctx); - RETURN_ERROR_IF_FALSE( - result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, - std::string("Error while setting cig context")); - } + ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on @@ -442,10 +416,6 @@ TRITONBACKEND_ModelInstanceExecute( // arrive between when the batch is formed and when batch is executed. 
semaphore->Acquire(); - if (cig_ctx != nullptr) { - cuCtxPopCurrent(&cig_ctx); - } - return nullptr; // success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 0da6682..9ad7f39 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -54,9 +54,38 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } + + //! Following functions are related to CiG (Cuda in Graphics) context sharing for + //! gaming use case. Creating a shared contexts reduces context switching overhead + //! and leads to better performance of model execution along side Graphics workload. CUcontext GetCiGContext() { return cig_ctx_; } bool isCiGEnabled() { return cig_ctx_ != nullptr; } + inline TRITONSERVER_Error* PushCiGContext() + { + if (CUDA_SUCCESS != cuCtxPushCurrent(cig_ctx_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to push CiG context for ") + Name()).c_str()); + } + return nullptr; + } + + inline TRITONSERVER_Error* PopCiGContext() + { + CUcontext oldCtx{}; + if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to [pop CiG context for ") + Name()).c_str()); + } + if (oldCtx != cig_ctx_) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("popping the wrong CiG context for ") + Name()).c_str()); + } + } + protected: common::TritonJson::Value graph_specs_; Priority priority_; @@ -68,4 +97,21 @@ class TensorRTModel : public BackendModel { CUcontext cig_ctx_; }; +struct ScopedRuntimeCiGContext { + ScopedRuntimeCiGContext(TensorRTModel* model_state) + : model_state_(model_state) + { + if (model_state_->isCiGEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCiGContext()); + } + } + ~ScopedRuntimeCiGContext() + { + if (model_state_->isCiGEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext()); + } + 
} + TensorRTModel* model_state_; +}; + }}} // namespace triton::backend::tensorrt From ed1296deff13331242c8d52695d7bd37df90bb45 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 28 Aug 2024 10:53:20 +0000 Subject: [PATCH 03/14] instance_state null check --- src/tensorrt.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index fe2ad88..aad6f21 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -350,12 +350,15 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast(vstate); - - ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); + if (!instance_state) + { + return nullptr; + } + ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); delete instance_state; From 05e37867f0efcd36bcd2a7931622cd2826ff404c Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Thu, 5 Sep 2024 10:13:45 +0000 Subject: [PATCH 04/14] Minor bug fix --- src/tensorrt_model.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 9ad7f39..c93bd58 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -84,6 +84,7 @@ class TensorRTModel : public BackendModel { TRITONSERVER_ERROR_INTERNAL, (std::string("popping the wrong CiG context for ") + Name()).c_str()); } + return nullptr; } protected: From 9ae5a091440b5465362011361843d046d8100a74 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Thu, 19 Sep 2024 12:18:02 +0530 Subject: [PATCH 05/14] pre-commit fixes --- src/tensorrt.cc | 7 +++---- src/tensorrt_model.cc | 1 + src/tensorrt_model.h | 12 +++++++----- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index aad6f21..1bd0266 100644 --- a/src/tensorrt.cc +++ 
b/src/tensorrt.cc @@ -350,13 +350,12 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast(vstate); - + LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); - if (!instance_state) - { - return nullptr; + if (!instance_state) { + return nullptr; } ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 7edcec6..8285189 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "tensorrt_model.h" + #include namespace triton { namespace backend { namespace tensorrt { diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index c93bd58..708a51a 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -25,9 +25,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#include "triton/backend/backend_model.h" #include +#include "triton/backend/backend_model.h" + namespace triton { namespace backend { namespace tensorrt { class TensorRTModel : public BackendModel { @@ -55,9 +56,10 @@ class TensorRTModel : public BackendModel { bool BusyWaitEvents() { return busy_wait_events_; } - //! Following functions are related to CiG (Cuda in Graphics) context sharing for - //! gaming use case. Creating a shared contexts reduces context switching overhead - //! and leads to better performance of model execution along side Graphics workload. + //! Following functions are related to CiG (Cuda in Graphics) context sharing + //! for gaming use case. Creating a shared contexts reduces context switching + //! overhead and leads to better performance of model execution along side + //! Graphics workload. 
CUcontext GetCiGContext() { return cig_ctx_; } bool isCiGEnabled() { return cig_ctx_ != nullptr; } @@ -107,7 +109,7 @@ struct ScopedRuntimeCiGContext { } } ~ScopedRuntimeCiGContext() - { + { if (model_state_->isCiGEnabled()) { THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext()); } From 89ab580a4d7571deacd16298a607abcbb5b93911 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Thu, 19 Sep 2024 08:30:00 +0000 Subject: [PATCH 06/14] Added new cmake flag TRITON_ENABLE_CIG to make the CiG support build conditional --- CMakeLists.txt | 14 +++++++++++++- src/instance_state.cc | 15 ++++++++++++--- src/model_state.cc | 21 +++++++++++++++++---- src/tensorrt.cc | 8 +++++++- src/tensorrt_model.cc | 9 ++++++++- src/tensorrt_model.h | 10 +++++++++- 6 files changed, 66 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b798d11..c88248d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,8 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) +option(TRITON_ENABLE_CIG "Enable Cuda in Graphics (CiG) support in backend." OFF) + set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.") set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. 
Multiple paths may be specified by separating them with a semicolon.") @@ -269,9 +271,19 @@ target_link_libraries( triton-tensorrt-backend PRIVATE CUDA::cudart - CUDA::cuda_driver ) +if(${TRITON_ENABLE_CIG}) + target_compile_definitions( + triton-tensorrt-backend + PRIVATE TRITON_ENABLE_CIG + ) + target_link_libraries( + triton-tensorrt-backend + PRIVATE + CUDA::cuda_driver + ) +endif() # # Install diff --git a/src/instance_state.cc b/src/instance_state.cc index e7113d9..4a4fbc1 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,8 +257,11 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) { + if (!model_state_->isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cudaSetDevice(DeviceId()); } for (auto& io_binding_infos : io_binding_infos_) { @@ -427,8 +430,11 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) { + if (!model_state_->isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cudaSetDevice(DeviceId()); } #ifdef TRITON_ENABLE_STATS @@ -1557,8 +1563,11 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* ModelInstanceState::InitStreamsAndEvents() { +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) { + if (!model_state_->isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { // Set the device before preparing the context. 
auto cuerr = cudaSetDevice(DeviceId()); if (cuerr != cudaSuccess) { diff --git a/src/model_state.cc b/src/model_state.cc index 0622a94..8b8b5d2 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,8 +175,11 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cudaSetDevice(device_engine.first.first); } auto& runtime = device_engine.second.first; @@ -212,8 +215,12 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. if (eit->second.second == nullptr) { + +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -326,8 +333,11 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cuerr = cudaSetDevice(device_id); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -381,8 +391,11 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 1bd0266..6476313 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,7 +318,9 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) 
DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } +#ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(model_state); +#endif //TRITON_ENABLE_CIG // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. @@ -357,7 +359,9 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) if (!instance_state) { return nullptr; } +#ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); +#endif //TRITON_ENABLE_CIG delete instance_state; @@ -382,7 +386,9 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); - ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); +#ifdef TRITON_ENABLE_CIG + ScopedRuntimeCiGContext cig_scope(model_state); +#endif //TRITON_ENABLE_CIG // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 8285189..71259e9 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -55,7 +55,10 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), priority_(Priority::DEFAULT), use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), separate_output_stream_(false), eager_batching_(false), - busy_wait_events_(false), cig_ctx_(nullptr) + busy_wait_events_(false) +#ifdef TRITON_ENABLE_CIG + ,cig_ctx_(nullptr) +#endif // TRITON_ENABLE_CIG { ParseModelConfig(); } @@ -91,6 +94,8 @@ TensorRTModel::ParseModelConfig() cuda.MemberAsBool("output_copy_stream", &separate_output_stream_)); } } + +#ifdef TRITON_ENABLE_CIG triton::common::TritonJson::Value parameters; if (model_config_.Find("parameters", ¶meters)) { triton::common::TritonJson::Value value; @@ -105,6 +110,8 @@ TensorRTModel::ParseModelConfig() LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, 
"CiG Context pointer is set"); } } +#endif //TRITON_ENABLE_CIG + return nullptr; // Success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 708a51a..27c1f2d 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -25,7 +25,9 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once +#ifdef TRITON_ENABLE_CIG #include +#endif //TRITON_ENABLE_CIG #include "triton/backend/backend_model.h" @@ -55,7 +57,7 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } - +#ifdef TRITON_ENABLE_CIG //! Following functions are related to CiG (Cuda in Graphics) context sharing //! for gaming use case. Creating a shared contexts reduces context switching //! overhead and leads to better performance of model execution along side @@ -88,6 +90,7 @@ class TensorRTModel : public BackendModel { } return nullptr; } +#endif //TRITON_ENABLE_CIG protected: common::TritonJson::Value graph_specs_; @@ -97,9 +100,13 @@ class TensorRTModel : public BackendModel { bool separate_output_stream_; bool eager_batching_; bool busy_wait_events_; +#ifdef TRITON_ENABLE_CIG CUcontext cig_ctx_; +#endif //TRITON_ENABLE_CIG + }; +#ifdef TRITON_ENABLE_CIG struct ScopedRuntimeCiGContext { ScopedRuntimeCiGContext(TensorRTModel* model_state) : model_state_(model_state) @@ -116,5 +123,6 @@ struct ScopedRuntimeCiGContext { } TensorRTModel* model_state_; }; +#endif //TRITON_ENABLE_CIG }}} // namespace triton::backend::tensorrt From b624b98bc74782cd19e3ea85c9680baea6ae2813 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Tue, 24 Sep 2024 11:53:35 +0530 Subject: [PATCH 07/14] pre-commit fixes --- src/instance_state.cc | 6 +++--- src/model_state.cc | 17 ++++++++--------- src/tensorrt.cc | 6 +++--- src/tensorrt_model.cc | 5 +++-- src/tensorrt_model.h | 9 ++++----- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/instance_state.cc b/src/instance_state.cc index 
4a4fbc1..9a735bc 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -260,7 +260,7 @@ ModelInstanceState::~ModelInstanceState() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled if (!model_state_->isCiGEnabled()) -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG { cudaSetDevice(DeviceId()); } @@ -433,7 +433,7 @@ ModelInstanceState::Run( #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled if (!model_state_->isCiGEnabled()) -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG { cudaSetDevice(DeviceId()); } @@ -1566,7 +1566,7 @@ ModelInstanceState::InitStreamsAndEvents() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled if (!model_state_->isCiGEnabled()) -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG { // Set the device before preparing the context. auto cuerr = cudaSetDevice(DeviceId()); diff --git a/src/model_state.cc b/src/model_state.cc index 8b8b5d2..a2ab925 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -177,8 +177,8 @@ ModelState::~ModelState() for (auto& device_engine : device_engines_) { #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { cudaSetDevice(device_engine.first.first); } @@ -215,11 +215,10 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. 
if (eit->second.second == nullptr) { - #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { @@ -335,8 +334,8 @@ ModelState::AutoCompleteConfig() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { cuerr = cudaSetDevice(device_id); if (cuerr != cudaSuccess) { @@ -393,8 +392,8 @@ ModelState::AutoCompleteConfig() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 6476313..d98a466 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -320,7 +320,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) #ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(model_state); -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. 
@@ -361,7 +361,7 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) } #ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG delete instance_state; @@ -388,7 +388,7 @@ TRITONBACKEND_ModelInstanceExecute( #ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(model_state); -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 71259e9..9339dcb 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -57,7 +57,8 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) separate_output_stream_(false), eager_batching_(false), busy_wait_events_(false) #ifdef TRITON_ENABLE_CIG - ,cig_ctx_(nullptr) + , + cig_ctx_(nullptr) #endif // TRITON_ENABLE_CIG { ParseModelConfig(); @@ -110,7 +111,7 @@ TensorRTModel::ParseModelConfig() LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set"); } } -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG return nullptr; // Success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 27c1f2d..1e40e61 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -27,7 +27,7 @@ #ifdef TRITON_ENABLE_CIG #include -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG #include "triton/backend/backend_model.h" @@ -90,7 +90,7 @@ class TensorRTModel : public BackendModel { } return nullptr; } -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG protected: common::TritonJson::Value graph_specs_; @@ -102,8 +102,7 @@ class TensorRTModel : public BackendModel { bool busy_wait_events_; #ifdef TRITON_ENABLE_CIG CUcontext cig_ctx_; -#endif //TRITON_ENABLE_CIG - +#endif // TRITON_ENABLE_CIG }; #ifdef TRITON_ENABLE_CIG @@ -123,6 +122,6 @@ struct ScopedRuntimeCiGContext { } 
TensorRTModel* model_state_; }; -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG }}} // namespace triton::backend::tensorrt From 1f1ae7e750dc41c8fa6cfef031ca7775129f37cb Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 30 Oct 2024 03:40:15 -0700 Subject: [PATCH 08/14] CiG->Cuda. Making the changes more generic to cuda context sharing + hidden ifdefs --- CMakeLists.txt | 6 ++-- src/instance_state.cc | 18 ++--------- src/model_state.cc | 24 +++----------- src/tensorrt.cc | 13 +++----- src/tensorrt_model.cc | 21 ++++-------- src/tensorrt_model.h | 75 ++++++++++++++++++++++++++++--------------- 6 files changed, 69 insertions(+), 88 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c88248d..cd136cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) -option(TRITON_ENABLE_CIG "Enable Cuda in Graphics (CiG) support in backend." OFF) +option(TRITON_ENABLE_CUDA_CTX_SHARING "Enable Cuda context sharing support in backend." OFF) set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.") set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. 
Multiple paths may be specified by separating them with a semicolon.") @@ -273,10 +273,10 @@ target_link_libraries( CUDA::cudart ) -if(${TRITON_ENABLE_CIG}) +if(${TRITON_ENABLE_CUDA_CTX_SHARING}) target_compile_definitions( triton-tensorrt-backend - PRIVATE TRITON_ENABLE_CIG + PRIVATE TRITON_ENABLE_CUDA_CTX_SHARING ) target_link_libraries( triton-tensorrt-backend diff --git a/src/instance_state.cc b/src/instance_state.cc index 9a735bc..06b38b8 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,11 +257,7 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!model_state_->isCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } for (auto& io_binding_infos : io_binding_infos_) { @@ -430,11 +426,7 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!model_state_->isCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } #ifdef TRITON_ENABLE_STATS @@ -1563,11 +1555,7 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* ModelInstanceState::InitStreamsAndEvents() { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!model_state_->isCudaContextSharingEnabled()) { // Set the device before preparing the context. 
auto cuerr = cudaSetDevice(DeviceId()); if (cuerr != cudaSuccess) { diff --git a/src/model_state.cc b/src/model_state.cc index a2ab925..5912725 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,11 +175,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { cudaSetDevice(device_engine.first.first); } auto& runtime = device_engine.second.first; @@ -215,11 +211,7 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. if (eit->second.second == nullptr) { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -332,11 +324,7 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { cuerr = cudaSetDevice(device_id); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -390,11 +378,7 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( diff --git a/src/tensorrt.cc b/src/tensorrt.cc index d98a466..cdc29b3 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,9 +318,7 @@ 
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } -#ifdef TRITON_ENABLE_CIG - ScopedRuntimeCiGContext cig_scope(model_state); -#endif // TRITON_ENABLE_CIG + ScopedRuntimeCudaContext cig_scope(model_state); // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. @@ -359,9 +357,8 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) if (!instance_state) { return nullptr; } -#ifdef TRITON_ENABLE_CIG - ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); -#endif // TRITON_ENABLE_CIG + + ScopedRuntimeCudaContext cig_scope(instance_state->StateForModel()); delete instance_state; @@ -386,9 +383,7 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); -#ifdef TRITON_ENABLE_CIG - ScopedRuntimeCiGContext cig_scope(model_state); -#endif // TRITON_ENABLE_CIG + ScopedRuntimeCudaContext cig_scope(model_state); // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 9339dcb..05b7995 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -26,8 +26,6 @@ #include "tensorrt_model.h" -#include - namespace triton { namespace backend { namespace tensorrt { TensorRTModel::Priority @@ -56,10 +54,6 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), separate_output_stream_(false), eager_batching_(false), busy_wait_events_(false) -#ifdef TRITON_ENABLE_CIG - , - cig_ctx_(nullptr) -#endif // TRITON_ENABLE_CIG { ParseModelConfig(); } @@ -96,22 +90,19 @@ TensorRTModel::ParseModelConfig() } } -#ifdef TRITON_ENABLE_CIG +// TODO Ashish +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING 
triton::common::TritonJson::Value parameters; if (model_config_.Find("parameters", ¶meters)) { triton::common::TritonJson::Value value; std::string ptr_value; - if (parameters.Find("CIG_CONTEXT_PTR", &value)) { + if (parameters.Find("CUDA_CONTEXT_PTR", &value)) { RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value)); - std::stringstream ss; - ss << ptr_value; - void* ctx_ptr; - ss >> ctx_ptr; - cig_ctx_ = static_cast(ctx_ptr); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set"); + cuda_ctx = static_cast(StringToPointer(ptr_value)); + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set"); } } -#endif // TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; // Success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 1e40e61..2de730e 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -25,9 +25,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#ifdef TRITON_ENABLE_CIG +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING #include -#endif // TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CUDA_CTX_SHARING +#include #include "triton/backend/backend_model.h" @@ -57,40 +58,60 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } -#ifdef TRITON_ENABLE_CIG - //! Following functions are related to CiG (Cuda in Graphics) context sharing + void* StringToPointer(std::string& str) + { + std::stringstream ss; + ss << str; + + void* ctx_ptr; + ss >> ctx_ptr; + return ctx_ptr; + } + + //! Following functions are related to Cuda (Cuda in Graphics) context sharing //! for gaming use case. Creating a shared contexts reduces context switching //! overhead and leads to better performance of model execution along side //! Graphics workload. 
- CUcontext GetCiGContext() { return cig_ctx_; } - bool isCiGEnabled() { return cig_ctx_ != nullptr; } - inline TRITONSERVER_Error* PushCiGContext() + bool isCudaContextSharingEnabled() + { +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + return cuda_ctx != nullptr; +#else + return false; +#endif // TRITON_ENABLE_CUDA_CTX_SHARING + } + + inline TRITONSERVER_Error* PushCudaContext() { - if (CUDA_SUCCESS != cuCtxPushCurrent(cig_ctx_)) { +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + if (CUDA_SUCCESS != cuCtxPushCurrent(cuda_ctx)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to push CiG context for ") + Name()).c_str()); + (std::string("unable to push Cuda context for ") + Name()).c_str()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; } - inline TRITONSERVER_Error* PopCiGContext() + inline TRITONSERVER_Error* PopCudaContext() { +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING CUcontext oldCtx{}; if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to [pop CiG context for ") + Name()).c_str()); + (std::string("unable to pop Cuda context for ") + Name()).c_str()); } - if (oldCtx != cig_ctx_) { + if (oldCtx != cuda_ctx) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("popping the wrong CiG context for ") + Name()).c_str()); + (std::string("popping the wrong Cuda context for ") + Name()) + .c_str()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; } -#endif // TRITON_ENABLE_CIG protected: common::TritonJson::Value graph_specs_; @@ -100,28 +121,30 @@ class TensorRTModel : public BackendModel { bool separate_output_stream_; bool eager_batching_; bool busy_wait_events_; -#ifdef TRITON_ENABLE_CIG - CUcontext cig_ctx_; -#endif // TRITON_ENABLE_CIG +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + CUcontext cuda_ctx = nullptr; +#endif // TRITON_ENABLE_CUDA_CTX_SHARING }; -#ifdef TRITON_ENABLE_CIG -struct ScopedRuntimeCiGContext { - 
ScopedRuntimeCiGContext(TensorRTModel* model_state) +struct ScopedRuntimeCudaContext { + ScopedRuntimeCudaContext(TensorRTModel* model_state) : model_state_(model_state) { - if (model_state_->isCiGEnabled()) { - THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCiGContext()); +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + if (model_state_->isCudaContextSharingEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING } - ~ScopedRuntimeCiGContext() + ~ScopedRuntimeCudaContext() { - if (model_state_->isCiGEnabled()) { - THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext()); +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + if (model_state_->isCudaContextSharingEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING } TensorRTModel* model_state_; }; -#endif // TRITON_ENABLE_CIG }}} // namespace triton::backend::tensorrt From b42a8d6538a6629bacb937f59c7e86345cf74cba Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 30 Oct 2024 03:47:23 -0700 Subject: [PATCH 09/14] remove todo --- src/tensorrt_model.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 05b7995..42fdc39 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -90,7 +90,6 @@ TensorRTModel::ParseModelConfig() } } -// TODO Ashish #ifdef TRITON_ENABLE_CUDA_CTX_SHARING triton::common::TritonJson::Value parameters; if (model_config_.Find("parameters", ¶meters)) { From d9aff26b2c0f1094106c60986b0433f980c77307 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 30 Oct 2024 04:25:10 -0700 Subject: [PATCH 10/14] Add GetParameter to fetch string params --- src/tensorrt_model.cc | 39 +++++++++++++++++++++++++++++---------- src/tensorrt_model.h | 12 ++++++++++++ 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 42fdc39..bd419d5 100644 --- a/src/tensorrt_model.cc +++ 
b/src/tensorrt_model.cc @@ -91,16 +91,11 @@ TensorRTModel::ParseModelConfig() } #ifdef TRITON_ENABLE_CUDA_CTX_SHARING - triton::common::TritonJson::Value parameters; - if (model_config_.Find("parameters", ¶meters)) { - triton::common::TritonJson::Value value; - std::string ptr_value; - if (parameters.Find("CUDA_CONTEXT_PTR", &value)) { - RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value)); - cuda_ctx = static_cast(StringToPointer(ptr_value)); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set"); - } - } + std::string ptr_str = ""; + RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str)); + cuda_ctx = static_cast(StringToPointer(ptr_str)); + // cuda_ctx = static_cast(reinterpret_cast(ptr_str)); + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set"); #endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; // Success @@ -133,4 +128,28 @@ TensorRTModel::GetCudaStreamPriority() return cuda_stream_priority; } +template <> +TRITONSERVER_Error* +TensorRTModel::GetParameter( + std::string const& name, std::string& str_value) +{ + triton::common::TritonJson::Value parameters; + TRITONSERVER_Error* err = + model_config_.MemberAsObject("parameters", ¶meters); + if (err != nullptr) { + return err; + // throw std::runtime_error("Model config doesn't have a parameters + // section"); + } + triton::common::TritonJson::Value value; + err = parameters.MemberAsObject(name.c_str(), &value); + if (err != nullptr) { + return err; + // std::string errStr = "Cannot find parameter with name: " + name; + // throw std::runtime_error(errStr); + } + value.MemberAsString("string_value", &str_value); + return nullptr; +} + }}} // namespace triton::backend::tensorrt diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 2de730e..7e5fe92 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -39,6 +39,14 @@ class TensorRTModel : public BackendModel { TensorRTModel(TRITONBACKEND_Model* triton_model); virtual ~TensorRTModel() = 
default; + template + TRITONSERVER_Error* GetParameter(std::string const& name, T& value) + { + assert(false); + auto dummy = T(); + return dummy; + } + TRITONSERVER_Error* SetTensorRTModelConfig(); TRITONSERVER_Error* ParseModelConfig(); @@ -58,6 +66,10 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } + template <> + TRITONSERVER_Error* GetParameter( + std::string const& name, std::string& str_value); + void* StringToPointer(std::string& str) { std::stringstream ss; From 2ca19acf3a4cf00d167b785b8b39f6a68ef0229d Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Mon, 6 Jan 2025 00:37:44 -0800 Subject: [PATCH 11/14] CiG->Cuda + comment updates --- src/tensorrt.cc | 6 +++--- src/tensorrt_model.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index cdc29b3..747b867 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,7 +318,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } - ScopedRuntimeCudaContext cig_scope(model_state); + ScopedRuntimeCudaContext cuda_scope(model_state); // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance.
@@ -358,7 +358,7 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) return nullptr; } - ScopedRuntimeCudaContext cig_scope(instance_state->StateForModel()); + ScopedRuntimeCudaContext cuda_scope(instance_state->StateForModel()); delete instance_state; @@ -383,7 +383,7 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); - ScopedRuntimeCudaContext cig_scope(model_state); + ScopedRuntimeCudaContext cuda_scope(model_state); // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 7e5fe92..413e808 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -80,10 +80,10 @@ class TensorRTModel : public BackendModel { return ctx_ptr; } - //! Following functions are related to Cuda (Cuda in Graphics) context sharing - //! for gaming use case. Creating a shared contexts reduces context switching - //! overhead and leads to better performance of model execution along side - //! Graphics workload. + //! Following functions are related to custom Cuda context (Cuda in Graphics) + //! sharing for gaming use case. Creating a shared contexts reduces context + //! switching overhead and leads to better performance of model execution + //! along side Graphics workload. 
bool isCudaContextSharingEnabled() { From 20283c6d76eec9d6035ea71e4dd51d8f78a29c9d Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Mon, 6 Jan 2025 00:38:44 -0800 Subject: [PATCH 12/14] Use RETURN_IF_ERROR macro --- src/tensorrt_model.cc | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index bd419d5..b466bfa 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -134,20 +134,11 @@ TensorRTModel::GetParameter( std::string const& name, std::string& str_value) { triton::common::TritonJson::Value parameters; - TRITONSERVER_Error* err = - model_config_.MemberAsObject("parameters", ¶meters); - if (err != nullptr) { - return err; - // throw std::runtime_error("Model config doesn't have a parameters - // section"); - } + RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", ¶meters)); + triton::common::TritonJson::Value value; - err = parameters.MemberAsObject(name.c_str(), &value); - if (err != nullptr) { - return err; - // std::string errStr = "Cannot find parameter with name: " + name; - // throw std::runtime_error(errStr); - } + RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value)); + value.MemberAsString("string_value", &str_value); return nullptr; } From e2a9336d9ceaf3034bed0c73cd86af2b80115baf Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Mon, 27 Jan 2025 08:07:03 -0800 Subject: [PATCH 13/14] Handle Multi-GPU failure case --- src/model_state.cc | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/model_state.cc b/src/model_state.cc index 5912725..b337c63 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -324,15 +324,25 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); - if (!isCudaContextSharingEnabled()) { - cuerr = cudaSetDevice(device_id); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable 
to set CUDA device to GPU ") + - std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) - .c_str()); - } +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + // Return failure if Cuda context sharing is enabled and + // if it is a multi GPU setup + if (isCudaContextSharingEnabled() && device_id != 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string( + "Cuda context sharing is not supported on mult-GPU system.")) + .c_str()); + } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING + + cuerr = cudaSetDevice(device_id); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set CUDA device to GPU ") + + std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) + .c_str()); } std::string artifact_name; From 625d19e05740eb44e5359828d85a88a8129d11be Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Tue, 28 Jan 2025 02:37:43 -0800 Subject: [PATCH 14/14] typo + styling fixes --- src/instance_state.cc | 6 +++--- src/model_state.cc | 10 +++++----- src/tensorrt_model.h | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/instance_state.cc b/src/instance_state.cc index 06b38b8..1e0517b 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,7 +257,7 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { - if (!model_state_->isCudaContextSharingEnabled()) { + if (!model_state_->IsCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } for (auto& io_binding_infos : io_binding_infos_) { @@ -426,7 +426,7 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); - if (!model_state_->isCudaContextSharingEnabled()) { + if (!model_state_->IsCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } #ifdef TRITON_ENABLE_STATS @@ -1555,7 +1555,7 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* 
ModelInstanceState::InitStreamsAndEvents() { - if (!model_state_->isCudaContextSharingEnabled()) { + if (!model_state_->IsCudaContextSharingEnabled()) { // Set the device before preparing the context. auto cuerr = cudaSetDevice(DeviceId()); if (cuerr != cudaSuccess) { diff --git a/src/model_state.cc b/src/model_state.cc index b337c63..9d547c1 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,7 +175,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { - if (!isCudaContextSharingEnabled()) { + if (!IsCudaContextSharingEnabled()) { cudaSetDevice(device_engine.first.first); } auto& runtime = device_engine.second.first; @@ -211,7 +211,7 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. if (eit->second.second == nullptr) { - if (!isCudaContextSharingEnabled()) { + if (!IsCudaContextSharingEnabled()) { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -327,11 +327,11 @@ ModelState::AutoCompleteConfig() #ifdef TRITON_ENABLE_CUDA_CTX_SHARING // Return failure if Cuda context sharing is enabled and // if it is a multi GPU setup - if (isCudaContextSharingEnabled() && device_id != 0) { + if (IsCudaContextSharingEnabled() && device_id != 0) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, (std::string( - "Cuda context sharing is not supported on mult-GPU system.")) + "Cuda context sharing is not supported on multi-GPU system.")) .c_str()); } #endif // TRITON_ENABLE_CUDA_CTX_SHARING @@ -388,7 +388,7 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); - if (!isCudaContextSharingEnabled()) { + if (!IsCudaContextSharingEnabled()) { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( diff --git a/src/tensorrt_model.h 
b/src/tensorrt_model.h index 413e808..bbc2329 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -85,7 +85,7 @@ class TensorRTModel : public BackendModel { //! switching overhead and leads to better performance of model execution //! along side Graphics workload. - bool isCudaContextSharingEnabled() + bool IsCudaContextSharingEnabled() { #ifdef TRITON_ENABLE_CUDA_CTX_SHARING return cuda_ctx != nullptr; @@ -143,7 +143,7 @@ struct ScopedRuntimeCudaContext { : model_state_(model_state) { #ifdef TRITON_ENABLE_CUDA_CTX_SHARING - if (model_state_->isCudaContextSharingEnabled()) { + if (model_state_->IsCudaContextSharingEnabled()) { THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext()); } #endif // TRITON_ENABLE_CUDA_CTX_SHARING @@ -151,7 +151,7 @@ struct ScopedRuntimeCudaContext { ~ScopedRuntimeCudaContext() { #ifdef TRITON_ENABLE_CUDA_CTX_SHARING - if (model_state_->isCudaContextSharingEnabled()) { + if (model_state_->IsCudaContextSharingEnabled()) { THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext()); } #endif // TRITON_ENABLE_CUDA_CTX_SHARING