Revert "Enable Cuda in Graphics Implementation for TensorRT backend (… #105

Merged
Merged 1 commit on Jan 30, 2025
13 changes: 0 additions & 13 deletions CMakeLists.txt
@@ -37,8 +37,6 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat
option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON)
option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF)
option(TRITON_ENABLE_CUDA_CTX_SHARING "Enable Cuda context sharing support in backend." OFF)

set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.")
set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. Multiple paths may be specified by separating them with a semicolon.")

@@ -234,17 +232,6 @@ target_link_libraries(
CUDA::cudart
)

if(${TRITON_ENABLE_CUDA_CTX_SHARING})
target_compile_definitions(
triton-tensorrt-backend
PRIVATE TRITON_ENABLE_CUDA_CTX_SHARING
)
target_link_libraries(
triton-tensorrt-backend
PRIVATE
CUDA::cuda_driver
)
endif()

#
# Install
25 changes: 9 additions & 16 deletions src/instance_state.cc
@@ -257,9 +257,7 @@ ModelInstanceState::ModelInstanceState(

ModelInstanceState::~ModelInstanceState()
{
if (!model_state_->IsCudaContextSharingEnabled()) {
cudaSetDevice(DeviceId());
}
cudaSetDevice(DeviceId());
for (auto& io_binding_infos : io_binding_infos_) {
for (auto& io_binding_info : io_binding_infos) {
if (!io_binding_info.IsDynamicShapeOutput() &&
@@ -426,9 +424,7 @@ ModelInstanceState::Run(
payload_.reset(new Payload(next_set_, requests, request_count));
SET_TIMESTAMP(payload_->compute_start_ns_);

if (!model_state_->IsCudaContextSharingEnabled()) {
cudaSetDevice(DeviceId());
}
cudaSetDevice(DeviceId());
#ifdef TRITON_ENABLE_STATS
{
SET_TIMESTAMP(payload_->compute_start_ns_);
@@ -1555,16 +1551,13 @@ ModelInstanceState::EvaluateTensorRTContext(
TRITONSERVER_Error*
ModelInstanceState::InitStreamsAndEvents()
{
if (!model_state_->IsCudaContextSharingEnabled()) {
// Set the device before preparing the context.
auto cuerr = cudaSetDevice(DeviceId());
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
}
// Set the device before preparing the context.
auto cuerr = cudaSetDevice(DeviceId());
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") +
Name() + ": " + cudaGetErrorString(cuerr))
.c_str());
}

// Create CUDA streams associated with the instance
49 changes: 16 additions & 33 deletions src/model_state.cc
@@ -175,9 +175,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
ModelState::~ModelState()
{
for (auto& device_engine : device_engines_) {
if (!IsCudaContextSharingEnabled()) {
cudaSetDevice(device_engine.first.first);
}
cudaSetDevice(device_engine.first.first);
auto& runtime = device_engine.second.first;
auto& engine = device_engine.second.second;
// Need to reset explicitly to ensure proper destruction order
@@ -211,16 +209,15 @@ ModelState::CreateEngine(
// We share the engine (for models that don't have dynamic shapes) and
// runtime across instances that have access to the same GPU/NVDLA.
if (eit->second.second == nullptr) {
if (!IsCudaContextSharingEnabled()) {
auto cuerr = cudaSetDevice(gpu_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
}
auto cuerr = cudaSetDevice(gpu_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to set device for ") + Name() + ": " +
cudaGetErrorString(cuerr))
.c_str());
}

const bool new_runtime = (eit->second.first == nullptr);
RETURN_IF_ERROR(LoadPlan(
model_path, dla_core_id, &eit->second.first, &eit->second.second,
@@ -324,18 +321,6 @@ ModelState::AutoCompleteConfig()
" to auto-complete config for " + Name())
.c_str()));

#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
// Return failure if Cuda context sharing is enabled and
// if it is a multi GPU setup
if (IsCudaContextSharingEnabled() && device_id != 0) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string(
"Cuda context sharing is not supported on multi-GPU system."))
.c_str());
}
#endif // TRITON_ENABLE_CUDA_CTX_SHARING

cuerr = cudaSetDevice(device_id);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
@@ -388,15 +373,13 @@ ModelState::AutoCompleteConfig()

RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path));

if (!IsCudaContextSharingEnabled()) {
cuerr = cudaSetDevice(current_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to revert CUDA device to GPU ") +
std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
.c_str());
}
cuerr = cudaSetDevice(current_device);
if (cuerr != cudaSuccess) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to revert CUDA device to GPU ") +
std::to_string(current_device) + " : " + cudaGetErrorString(cuerr))
.c_str());
}

if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
8 changes: 0 additions & 8 deletions src/tensorrt.cc
@@ -318,7 +318,6 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get());
}

ScopedRuntimeCudaContext cuda_scope(model_state);

// With each instance we create a ModelInstanceState object and
// associate it with the TRITONBACKEND_ModelInstance.
@@ -354,11 +353,6 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
"TRITONBACKEND_ModelInstanceFinalize: delete instance state");
if (!instance_state) {
return nullptr;
}

ScopedRuntimeCudaContext cuda_scope(instance_state->StateForModel());

delete instance_state;

@@ -383,8 +377,6 @@ TRITONBACKEND_ModelInstanceExecute(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();

ScopedRuntimeCudaContext cuda_scope(model_state);

// For TensorRT backend, the executing instance may not closely tie to
// TRITONBACKEND_ModelInstance, the instance will be assigned based on
// execution policy.
23 changes: 0 additions & 23 deletions src/tensorrt_model.cc
@@ -90,14 +90,6 @@ TensorRTModel::ParseModelConfig()
}
}

#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
std::string ptr_str = "";
RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str));
cuda_ctx = static_cast<CUcontext>(StringToPointer(ptr_str));
// cuda_ctx = static_cast<CUcontext>(reinterpret_cast<void*>(ptr_str));
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set");
#endif // TRITON_ENABLE_CUDA_CTX_SHARING

return nullptr; // Success
}

@@ -128,19 +120,4 @@ TensorRTModel::GetCudaStreamPriority()
return cuda_stream_priority;
}

template <>
TRITONSERVER_Error*
TensorRTModel::GetParameter<std::string>(
std::string const& name, std::string& str_value)
{
triton::common::TritonJson::Value parameters;
RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", &parameters));

triton::common::TritonJson::Value value;
RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value));

value.MemberAsString("string_value", &str_value);
return nullptr;
}

}}} // namespace triton::backend::tensorrt
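
Note on the removed code: the GetParameter<std::string> specialization deleted above, together with the StringToPointer() helper deleted from tensorrt_model.h below, defined how the shared CUDA context reached the backend. The embedding application formatted its CUcontext as text and passed it through the model-config parameter CUDA_CONTEXT_PTR (carried in the parameter's string_value), and the backend parsed that text back into a pointer. A minimal, self-contained sketch of that round trip follows; PointerToString and StringToCudaContext are illustrative names for this sketch, not functions from the backend.

#include <cuda.h>  // CUcontext (CUDA driver API)
#include <sstream>
#include <string>

// Hypothetical application-side helper: format a CUcontext as text so it can
// be placed in config.pbtxt as the "CUDA_CONTEXT_PTR" string_value.
static std::string PointerToString(CUcontext ctx)
{
  std::stringstream ss;
  ss << static_cast<void*>(ctx);  // e.g. "0x7f2a34000000"
  return ss.str();
}

// Backend-side inverse, equivalent to the removed StringToPointer() plus the
// static_cast in ParseModelConfig(): recover the pointer from its text form.
static CUcontext StringToCudaContext(const std::string& str)
{
  std::stringstream ss(str);
  void* ptr = nullptr;
  ss >> ptr;  // operator>> parses the "0x..." representation back into a pointer
  return static_cast<CUcontext>(ptr);
}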
96 changes: 0 additions & 96 deletions src/tensorrt_model.h
@@ -25,11 +25,6 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
#include <cuda.h>
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
#include <sstream>

#include "triton/backend/backend_model.h"

namespace triton { namespace backend { namespace tensorrt {
@@ -39,14 +34,6 @@ class TensorRTModel : public BackendModel {
TensorRTModel(TRITONBACKEND_Model* triton_model);
virtual ~TensorRTModel() = default;

template <typename T>
TRITONSERVER_Error* GetParameter(std::string const& name, T& value)
{
assert(false);
auto dummy = T();
return dummy;
}

TRITONSERVER_Error* SetTensorRTModelConfig();

TRITONSERVER_Error* ParseModelConfig();
@@ -66,65 +53,6 @@
bool EagerBatching() { return eager_batching_; }
bool BusyWaitEvents() { return busy_wait_events_; }

template <>
TRITONSERVER_Error* GetParameter<std::string>(
std::string const& name, std::string& str_value);

void* StringToPointer(std::string& str)
{
std::stringstream ss;
ss << str;

void* ctx_ptr;
ss >> ctx_ptr;
return ctx_ptr;
}

//! Following functions are related to custom Cuda context (Cuda in Graphics)
//! sharing for gaming use case. Creating a shared contexts reduces context
//! switching overhead and leads to better performance of model execution
//! along side Graphics workload.

bool IsCudaContextSharingEnabled()
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
return cuda_ctx != nullptr;
#else
return false;
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
}

inline TRITONSERVER_Error* PushCudaContext()
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
if (CUDA_SUCCESS != cuCtxPushCurrent(cuda_ctx)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to push Cuda context for ") + Name()).c_str());
}
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
return nullptr;
}

inline TRITONSERVER_Error* PopCudaContext()
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
CUcontext oldCtx{};
if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("unable to pop Cuda context for ") + Name()).c_str());
}
if (oldCtx != cuda_ctx) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
(std::string("popping the wrong Cuda context for ") + Name())
.c_str());
}
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
return nullptr;
}

protected:
common::TritonJson::Value graph_specs_;
Priority priority_;
@@ -133,30 +61,6 @@
bool separate_output_stream_;
bool eager_batching_;
bool busy_wait_events_;
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
CUcontext cuda_ctx = nullptr;
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
};

struct ScopedRuntimeCudaContext {
ScopedRuntimeCudaContext(TensorRTModel* model_state)
: model_state_(model_state)
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
if (model_state_->IsCudaContextSharingEnabled()) {
THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext());
}
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
}
~ScopedRuntimeCudaContext()
{
#ifdef TRITON_ENABLE_CUDA_CTX_SHARING
if (model_state_->IsCudaContextSharingEnabled()) {
THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext());
}
#endif // TRITON_ENABLE_CUDA_CTX_SHARING
}
TensorRTModel* model_state_;
};

}}} // namespace triton::backend::tensorrt
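
For reference, the ScopedRuntimeCudaContext guard removed above implemented a push/pop RAII pattern around the backend entry points shown in the tensorrt.cc diff (instance initialize, finalize, and execute), so CUDA work issued inside that scope ran in the application's shared context rather than the device's primary context. Below is a stripped-down, self-contained sketch of the same pattern using the CUDA driver API directly; ScopedCudaContext is an illustrative stand-in for this sketch, not the backend's class, and error handling is simplified to exceptions.

#include <cuda.h>       // cuCtxPushCurrent, cuCtxPopCurrent
#include <stdexcept>

// Make a caller-supplied CUDA context current for the lifetime of the scope,
// then restore the previous one when the scope ends.
class ScopedCudaContext {
 public:
  explicit ScopedCudaContext(CUcontext ctx) : ctx_(ctx)
  {
    if (ctx_ != nullptr && cuCtxPushCurrent(ctx_) != CUDA_SUCCESS) {
      throw std::runtime_error("unable to push CUDA context");
    }
  }
  ~ScopedCudaContext()
  {
    if (ctx_ != nullptr) {
      CUcontext old_ctx = nullptr;
      cuCtxPopCurrent(&old_ctx);  // best effort; a destructor must not throw
    }
  }
  ScopedCudaContext(const ScopedCudaContext&) = delete;
  ScopedCudaContext& operator=(const ScopedCudaContext&) = delete;

 private:
  CUcontext ctx_;
};

// Usage sketch: wrap a unit of TensorRT work so it executes in the shared context.
// void RunInference(CUcontext shared_ctx) {
//   ScopedCudaContext scope(shared_ctx);
//   // ...enqueue inference here...
// }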