From 134bf3396c6542b88bc1ef1b62469a9e2f012d82 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 28 Aug 2024 08:27:35 +0000 Subject: [PATCH 01/14] Enable CiG support in Tensorrt backend --- CMakeLists.txt | 1 + src/instance_state.cc | 28 ++++++++++++++------- src/model_state.cc | 57 ++++++++++++++++++++++++++----------------- src/tensorrt.cc | 35 ++++++++++++++++++++++++++ src/tensorrt_model.cc | 18 ++++++++++++-- src/tensorrt_model.h | 5 ++++ 6 files changed, 110 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a70a85..b798d11 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,6 +269,7 @@ target_link_libraries( triton-tensorrt-backend PRIVATE CUDA::cudart + CUDA::cuda_driver ) diff --git a/src/instance_state.cc b/src/instance_state.cc index 56208a1..e7113d9 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,7 +257,10 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { - cudaSetDevice(DeviceId()); + // Set device if CiG is disabled + if (!model_state_->isCiGEnabled()) { + cudaSetDevice(DeviceId()); + } for (auto& io_binding_infos : io_binding_infos_) { for (auto& io_binding_info : io_binding_infos) { if (!io_binding_info.IsDynamicShapeOutput() && @@ -424,7 +427,10 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); - cudaSetDevice(DeviceId()); + // Set device if CiG is disabled + if (!model_state_->isCiGEnabled()) { + cudaSetDevice(DeviceId()); + } #ifdef TRITON_ENABLE_STATS { SET_TIMESTAMP(payload_->compute_start_ns_); @@ -1551,13 +1557,17 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* ModelInstanceState::InitStreamsAndEvents() { - // Set the device before preparing the context. 
- auto cuerr = cudaSetDevice(DeviceId()); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, (std::string("unable to set device for ") + - Name() + ": " + cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!model_state_->isCiGEnabled()) { + // Set the device before preparing the context. + auto cuerr = cudaSetDevice(DeviceId()); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set device for ") + Name() + ": " + + cudaGetErrorString(cuerr)) + .c_str()); + } } // Create CUDA streams associated with the instance diff --git a/src/model_state.cc b/src/model_state.cc index 6127989..0622a94 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,7 +175,10 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { - cudaSetDevice(device_engine.first.first); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + cudaSetDevice(device_engine.first.first); + } auto& runtime = device_engine.second.first; auto& engine = device_engine.second.second; // Need to reset explicitly to ensure proper destruction order @@ -209,15 +212,17 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. 
if (eit->second.second == nullptr) { - auto cuerr = cudaSetDevice(gpu_device); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to set device for ") + Name() + ": " + - cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + auto cuerr = cudaSetDevice(gpu_device); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set device for ") + Name() + ": " + + cudaGetErrorString(cuerr)) + .c_str()); + } } - const bool new_runtime = (eit->second.first == nullptr); RETURN_IF_ERROR(LoadPlan( model_path, dla_core_id, &eit->second.first, &eit->second.second, @@ -321,13 +326,16 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); - cuerr = cudaSetDevice(device_id); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to set CUDA device to GPU ") + - std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + cuerr = cudaSetDevice(device_id); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set CUDA device to GPU ") + + std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) + .c_str()); + } } std::string artifact_name; @@ -373,13 +381,16 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); - cuerr = cudaSetDevice(current_device); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to revert CUDA device to GPU ") + - std::to_string(current_device) + " : " + cudaGetErrorString(cuerr)) - .c_str()); + // Set device if CiG is disabled + if (!isCiGEnabled()) { + cuerr = cudaSetDevice(current_device); + if (cuerr != cudaSuccess) { + return 
TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to revert CUDA device to GPU ") + + std::to_string(current_device) + " : " + cudaGetErrorString(cuerr)) + .c_str()); + } } if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) { diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 2c2d2a4..1517035 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,6 +318,13 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } + CUcontext cig_ctx = model_state->GetCiGContext(); + if (cig_ctx != nullptr) { + auto result = cuCtxPushCurrent(cig_ctx); + RETURN_ERROR_IF_FALSE( + result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, + std::string("Error while setting cig context")); + } // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. @@ -336,6 +343,10 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) instance, ba_array, ba_len)); } + if (cig_ctx != nullptr) { + cuCtxPopCurrent(&cig_ctx); + } + return nullptr; // success } @@ -350,12 +361,24 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) ModelInstanceState* instance_state = reinterpret_cast(vstate); + CUcontext cig_ctx = instance_state->StateForModel()->GetCiGContext(); + if (cig_ctx != nullptr) { + auto result = cuCtxPushCurrent(cig_ctx); + RETURN_ERROR_IF_FALSE( + result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, + std::string("Error while setting cig context")); + } + LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); delete instance_state; + if (cig_ctx != nullptr) { + cuCtxPopCurrent(&cig_ctx); + } + return nullptr; // success } @@ -377,6 +400,14 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); + CUcontext cig_ctx = 
instance_state->StateForModel()->GetCiGContext(); + if (cig_ctx != nullptr) { + auto result = cuCtxPushCurrent(cig_ctx); + RETURN_ERROR_IF_FALSE( + result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, + std::string("Error while setting cig context")); + } + // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on // execution policy. @@ -411,6 +442,10 @@ TRITONBACKEND_ModelInstanceExecute( // arrive between when the batch is formed and when batch is executed. semaphore->Acquire(); + if (cig_ctx != nullptr) { + cuCtxPopCurrent(&cig_ctx); + } + return nullptr; // success } diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index bf2959d..7edcec6 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "tensorrt_model.h" +#include namespace triton { namespace backend { namespace tensorrt { @@ -53,7 +54,7 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), priority_(Priority::DEFAULT), use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), separate_output_stream_(false), eager_batching_(false), - busy_wait_events_(false) + busy_wait_events_(false), cig_ctx_(nullptr) { ParseModelConfig(); } @@ -89,7 +90,20 @@ TensorRTModel::ParseModelConfig() cuda.MemberAsBool("output_copy_stream", &separate_output_stream_)); } } - + triton::common::TritonJson::Value parameters; + if (model_config_.Find("parameters", ¶meters)) { + triton::common::TritonJson::Value value; + std::string ptr_value; + if (parameters.Find("CIG_CONTEXT_PTR", &value)) { + RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value)); + std::stringstream ss; + ss << ptr_value; + void* ctx_ptr; + ss >> ctx_ptr; + cig_ctx_ = static_cast(ctx_ptr); + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set"); + } + } return nullptr; // Success } diff --git 
a/src/tensorrt_model.h b/src/tensorrt_model.h index 86c67a2..0da6682 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -26,6 +26,7 @@ #pragma once #include "triton/backend/backend_model.h" +#include namespace triton { namespace backend { namespace tensorrt { @@ -53,6 +54,9 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } + CUcontext GetCiGContext() { return cig_ctx_; } + bool isCiGEnabled() { return cig_ctx_ != nullptr; } + protected: common::TritonJson::Value graph_specs_; Priority priority_; @@ -61,6 +65,7 @@ class TensorRTModel : public BackendModel { bool separate_output_stream_; bool eager_batching_; bool busy_wait_events_; + CUcontext cig_ctx_; }; }}} // namespace triton::backend::tensorrt From 7353671a81f602607a3fbd5d7c1d31ea1c9e7aaf Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 28 Aug 2024 09:43:51 +0000 Subject: [PATCH 02/14] Creating scoped runtime context structure for better management of CiG context --- src/tensorrt.cc | 38 ++++-------------------------------- src/tensorrt_model.h | 46 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 34 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 1517035..fe2ad88 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,13 +318,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } - CUcontext cig_ctx = model_state->GetCiGContext(); - if (cig_ctx != nullptr) { - auto result = cuCtxPushCurrent(cig_ctx); - RETURN_ERROR_IF_FALSE( - result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, - std::string("Error while setting cig context")); - } + ScopedRuntimeCiGContext cig_scope(model_state); // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. 
@@ -343,10 +337,6 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) instance, ba_array, ba_len)); } - if (cig_ctx != nullptr) { - cuCtxPopCurrent(&cig_ctx); - } - return nullptr; // success } @@ -361,24 +351,14 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) ModelInstanceState* instance_state = reinterpret_cast(vstate); - CUcontext cig_ctx = instance_state->StateForModel()->GetCiGContext(); - if (cig_ctx != nullptr) { - auto result = cuCtxPushCurrent(cig_ctx); - RETURN_ERROR_IF_FALSE( - result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, - std::string("Error while setting cig context")); - } - + ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); + LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); delete instance_state; - if (cig_ctx != nullptr) { - cuCtxPopCurrent(&cig_ctx); - } - return nullptr; // success } @@ -400,13 +380,7 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); - CUcontext cig_ctx = instance_state->StateForModel()->GetCiGContext(); - if (cig_ctx != nullptr) { - auto result = cuCtxPushCurrent(cig_ctx); - RETURN_ERROR_IF_FALSE( - result == CUDA_SUCCESS, TRITONSERVER_ERROR_INTERNAL, - std::string("Error while setting cig context")); - } + ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on @@ -442,10 +416,6 @@ TRITONBACKEND_ModelInstanceExecute( // arrive between when the batch is formed and when batch is executed. 
semaphore->Acquire(); - if (cig_ctx != nullptr) { - cuCtxPopCurrent(&cig_ctx); - } - return nullptr; // success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 0da6682..9ad7f39 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -54,9 +54,38 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } + + //! Following functions are related to CiG (Cuda in Graphics) context sharing for + //! gaming use case. Creating a shared contexts reduces context switching overhead + //! and leads to better performance of model execution along side Graphics workload. CUcontext GetCiGContext() { return cig_ctx_; } bool isCiGEnabled() { return cig_ctx_ != nullptr; } + inline TRITONSERVER_Error* PushCiGContext() + { + if (CUDA_SUCCESS != cuCtxPushCurrent(cig_ctx_)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to push CiG context for ") + Name()).c_str()); + } + return nullptr; + } + + inline TRITONSERVER_Error* PopCiGContext() + { + CUcontext oldCtx{}; + if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to [pop CiG context for ") + Name()).c_str()); + } + if (oldCtx != cig_ctx_) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("popping the wrong CiG context for ") + Name()).c_str()); + } + } + protected: common::TritonJson::Value graph_specs_; Priority priority_; @@ -68,4 +97,21 @@ class TensorRTModel : public BackendModel { CUcontext cig_ctx_; }; +struct ScopedRuntimeCiGContext { + ScopedRuntimeCiGContext(TensorRTModel* model_state) + : model_state_(model_state) + { + if (model_state_->isCiGEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCiGContext()); + } + } + ~ScopedRuntimeCiGContext() + { + if (model_state_->isCiGEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext()); + } + 
} + TensorRTModel* model_state_; +}; + }}} // namespace triton::backend::tensorrt From ed1296deff13331242c8d52695d7bd37df90bb45 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 28 Aug 2024 10:53:20 +0000 Subject: [PATCH 03/14] instance_state null check --- src/tensorrt.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index fe2ad88..aad6f21 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -350,12 +350,15 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast(vstate); - - ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); + if (!instance_state) + { + return nullptr; + } + ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); delete instance_state; From 05e37867f0efcd36bcd2a7931622cd2826ff404c Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Thu, 5 Sep 2024 10:13:45 +0000 Subject: [PATCH 04/14] Minor bug fix --- src/tensorrt_model.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 9ad7f39..c93bd58 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -84,6 +84,7 @@ class TensorRTModel : public BackendModel { TRITONSERVER_ERROR_INTERNAL, (std::string("popping the wrong CiG context for ") + Name()).c_str()); } + return nullptr; } protected: From 9ae5a091440b5465362011361843d046d8100a74 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Thu, 19 Sep 2024 12:18:02 +0530 Subject: [PATCH 05/14] pre-commit fixes --- src/tensorrt.cc | 7 +++---- src/tensorrt_model.cc | 1 + src/tensorrt_model.h | 12 +++++++----- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index aad6f21..1bd0266 100644 --- a/src/tensorrt.cc +++ 
b/src/tensorrt.cc @@ -350,13 +350,12 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate)); ModelInstanceState* instance_state = reinterpret_cast(vstate); - + LOG_MESSAGE( TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state"); - if (!instance_state) - { - return nullptr; + if (!instance_state) { + return nullptr; } ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 7edcec6..8285189 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -25,6 +25,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "tensorrt_model.h" + #include namespace triton { namespace backend { namespace tensorrt { diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index c93bd58..708a51a 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -25,9 +25,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#include "triton/backend/backend_model.h" #include +#include "triton/backend/backend_model.h" + namespace triton { namespace backend { namespace tensorrt { class TensorRTModel : public BackendModel { @@ -55,9 +56,10 @@ class TensorRTModel : public BackendModel { bool BusyWaitEvents() { return busy_wait_events_; } - //! Following functions are related to CiG (Cuda in Graphics) context sharing for - //! gaming use case. Creating a shared contexts reduces context switching overhead - //! and leads to better performance of model execution along side Graphics workload. + //! Following functions are related to CiG (Cuda in Graphics) context sharing + //! for gaming use case. Creating a shared contexts reduces context switching + //! overhead and leads to better performance of model execution along side + //! Graphics workload. 
CUcontext GetCiGContext() { return cig_ctx_; } bool isCiGEnabled() { return cig_ctx_ != nullptr; } @@ -107,7 +109,7 @@ struct ScopedRuntimeCiGContext { } } ~ScopedRuntimeCiGContext() - { + { if (model_state_->isCiGEnabled()) { THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext()); } From 89ab580a4d7571deacd16298a607abcbb5b93911 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Thu, 19 Sep 2024 08:30:00 +0000 Subject: [PATCH 06/14] Added new cmake flag TRITON_ENABLE_CIG to make the CiG support build conditional --- CMakeLists.txt | 14 +++++++++++++- src/instance_state.cc | 15 ++++++++++++--- src/model_state.cc | 21 +++++++++++++++++---- src/tensorrt.cc | 8 +++++++- src/tensorrt_model.cc | 9 ++++++++- src/tensorrt_model.h | 10 +++++++++- 6 files changed, 66 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b798d11..c88248d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,8 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) +option(TRITON_ENABLE_CIG "Enable Cuda in Graphics (CiG) support in backend." OFF) + set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.") set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. 
Multiple paths may be specified by separating them with a semicolon.") @@ -269,9 +271,19 @@ target_link_libraries( triton-tensorrt-backend PRIVATE CUDA::cudart - CUDA::cuda_driver ) +if(${TRITON_ENABLE_CIG}) + target_compile_definitions( + triton-tensorrt-backend + PRIVATE TRITON_ENABLE_CIG + ) + target_link_libraries( + triton-tensorrt-backend + PRIVATE + CUDA::cuda_driver + ) +endif() # # Install diff --git a/src/instance_state.cc b/src/instance_state.cc index e7113d9..4a4fbc1 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,8 +257,11 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) { + if (!model_state_->isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cudaSetDevice(DeviceId()); } for (auto& io_binding_infos : io_binding_infos_) { @@ -427,8 +430,11 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) { + if (!model_state_->isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cudaSetDevice(DeviceId()); } #ifdef TRITON_ENABLE_STATS @@ -1557,8 +1563,11 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* ModelInstanceState::InitStreamsAndEvents() { +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) { + if (!model_state_->isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { // Set the device before preparing the context. 
auto cuerr = cudaSetDevice(DeviceId()); if (cuerr != cudaSuccess) { diff --git a/src/model_state.cc b/src/model_state.cc index 0622a94..8b8b5d2 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,8 +175,11 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cudaSetDevice(device_engine.first.first); } auto& runtime = device_engine.second.first; @@ -212,8 +215,12 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. if (eit->second.second == nullptr) { + +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -326,8 +333,11 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cuerr = cudaSetDevice(device_id); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -381,8 +391,11 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); +#ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) { + if (!isCiGEnabled()) +#endif //TRITON_ENABLE_CIG + { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 1bd0266..6476313 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,7 +318,9 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) 
DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } +#ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(model_state); +#endif //TRITON_ENABLE_CIG // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. @@ -357,7 +359,9 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) if (!instance_state) { return nullptr; } +#ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); +#endif //TRITON_ENABLE_CIG delete instance_state; @@ -382,7 +386,9 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); - ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); +#ifdef TRITON_ENABLE_CIG + ScopedRuntimeCiGContext cig_scope(model_state); +#endif //TRITON_ENABLE_CIG // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 8285189..71259e9 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -55,7 +55,10 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), priority_(Priority::DEFAULT), use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), separate_output_stream_(false), eager_batching_(false), - busy_wait_events_(false), cig_ctx_(nullptr) + busy_wait_events_(false) +#ifdef TRITON_ENABLE_CIG + ,cig_ctx_(nullptr) +#endif // TRITON_ENABLE_CIG { ParseModelConfig(); } @@ -91,6 +94,8 @@ TensorRTModel::ParseModelConfig() cuda.MemberAsBool("output_copy_stream", &separate_output_stream_)); } } + +#ifdef TRITON_ENABLE_CIG triton::common::TritonJson::Value parameters; if (model_config_.Find("parameters", ¶meters)) { triton::common::TritonJson::Value value; @@ -105,6 +110,8 @@ TensorRTModel::ParseModelConfig() LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, 
"CiG Context pointer is set"); } } +#endif //TRITON_ENABLE_CIG + return nullptr; // Success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 708a51a..27c1f2d 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -25,7 +25,9 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once +#ifdef TRITON_ENABLE_CIG #include +#endif //TRITON_ENABLE_CIG #include "triton/backend/backend_model.h" @@ -55,7 +57,7 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } - +#ifdef TRITON_ENABLE_CIG //! Following functions are related to CiG (Cuda in Graphics) context sharing //! for gaming use case. Creating a shared contexts reduces context switching //! overhead and leads to better performance of model execution along side @@ -88,6 +90,7 @@ class TensorRTModel : public BackendModel { } return nullptr; } +#endif //TRITON_ENABLE_CIG protected: common::TritonJson::Value graph_specs_; @@ -97,9 +100,13 @@ class TensorRTModel : public BackendModel { bool separate_output_stream_; bool eager_batching_; bool busy_wait_events_; +#ifdef TRITON_ENABLE_CIG CUcontext cig_ctx_; +#endif //TRITON_ENABLE_CIG + }; +#ifdef TRITON_ENABLE_CIG struct ScopedRuntimeCiGContext { ScopedRuntimeCiGContext(TensorRTModel* model_state) : model_state_(model_state) @@ -116,5 +123,6 @@ struct ScopedRuntimeCiGContext { } TensorRTModel* model_state_; }; +#endif //TRITON_ENABLE_CIG }}} // namespace triton::backend::tensorrt From b624b98bc74782cd19e3ea85c9680baea6ae2813 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Tue, 24 Sep 2024 11:53:35 +0530 Subject: [PATCH 07/14] pre-commit fixes --- src/instance_state.cc | 6 +++--- src/model_state.cc | 17 ++++++++--------- src/tensorrt.cc | 6 +++--- src/tensorrt_model.cc | 5 +++-- src/tensorrt_model.h | 9 ++++----- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/instance_state.cc b/src/instance_state.cc index 
4a4fbc1..9a735bc 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -260,7 +260,7 @@ ModelInstanceState::~ModelInstanceState() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled if (!model_state_->isCiGEnabled()) -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG { cudaSetDevice(DeviceId()); } @@ -433,7 +433,7 @@ ModelInstanceState::Run( #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled if (!model_state_->isCiGEnabled()) -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG { cudaSetDevice(DeviceId()); } @@ -1566,7 +1566,7 @@ ModelInstanceState::InitStreamsAndEvents() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled if (!model_state_->isCiGEnabled()) -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG { // Set the device before preparing the context. auto cuerr = cudaSetDevice(DeviceId()); diff --git a/src/model_state.cc b/src/model_state.cc index 8b8b5d2..a2ab925 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -177,8 +177,8 @@ ModelState::~ModelState() for (auto& device_engine : device_engines_) { #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { cudaSetDevice(device_engine.first.first); } @@ -215,11 +215,10 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. 
if (eit->second.second == nullptr) { - #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { @@ -335,8 +334,8 @@ ModelState::AutoCompleteConfig() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { cuerr = cudaSetDevice(device_id); if (cuerr != cudaSuccess) { @@ -393,8 +392,8 @@ ModelState::AutoCompleteConfig() #ifdef TRITON_ENABLE_CIG // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif //TRITON_ENABLE_CIG + if (!isCiGEnabled()) +#endif // TRITON_ENABLE_CIG { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { diff --git a/src/tensorrt.cc b/src/tensorrt.cc index 6476313..d98a466 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -320,7 +320,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) #ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(model_state); -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. 
@@ -361,7 +361,7 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) } #ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG delete instance_state; @@ -388,7 +388,7 @@ TRITONBACKEND_ModelInstanceExecute( #ifdef TRITON_ENABLE_CIG ScopedRuntimeCiGContext cig_scope(model_state); -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 71259e9..9339dcb 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -57,7 +57,8 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) separate_output_stream_(false), eager_batching_(false), busy_wait_events_(false) #ifdef TRITON_ENABLE_CIG - ,cig_ctx_(nullptr) + , + cig_ctx_(nullptr) #endif // TRITON_ENABLE_CIG { ParseModelConfig(); @@ -110,7 +111,7 @@ TensorRTModel::ParseModelConfig() LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set"); } } -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG return nullptr; // Success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 27c1f2d..1e40e61 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -27,7 +27,7 @@ #ifdef TRITON_ENABLE_CIG #include -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG #include "triton/backend/backend_model.h" @@ -90,7 +90,7 @@ class TensorRTModel : public BackendModel { } return nullptr; } -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG protected: common::TritonJson::Value graph_specs_; @@ -102,8 +102,7 @@ class TensorRTModel : public BackendModel { bool busy_wait_events_; #ifdef TRITON_ENABLE_CIG CUcontext cig_ctx_; -#endif //TRITON_ENABLE_CIG - +#endif // TRITON_ENABLE_CIG }; #ifdef TRITON_ENABLE_CIG @@ -123,6 +122,6 @@ struct ScopedRuntimeCiGContext { } 
TensorRTModel* model_state_; }; -#endif //TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CIG }}} // namespace triton::backend::tensorrt From 1f1ae7e750dc41c8fa6cfef031ca7775129f37cb Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 30 Oct 2024 03:40:15 -0700 Subject: [PATCH 08/14] CiG->Cuda. Making the changes more generic to cuda context sharing + hidden ifdefs --- CMakeLists.txt | 6 ++-- src/instance_state.cc | 18 ++--------- src/model_state.cc | 24 +++----------- src/tensorrt.cc | 13 +++----- src/tensorrt_model.cc | 21 ++++-------- src/tensorrt_model.h | 75 ++++++++++++++++++++++++++++--------------- 6 files changed, 69 insertions(+), 88 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c88248d..cd136cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard which feat option(TRITON_ENABLE_GPU "Enable GPU support in backend." ON) option(TRITON_ENABLE_STATS "Include statistics collections in backend." ON) option(TRITON_ENABLE_NVTX "Include nvtx markers collection in backend." OFF) -option(TRITON_ENABLE_CIG "Enable Cuda in Graphics (CiG) support in backend." OFF) +option(TRITON_ENABLE_CUDA_CTX_SHARING "Enable Cuda context sharing support in backend." OFF) set(TRITON_TENSORRT_LIB_PATHS "" CACHE PATH "Paths to TensorRT libraries. Multiple paths may be specified by separating them with a semicolon.") set(TRITON_TENSORRT_INCLUDE_PATHS "" CACHE PATH "Paths to TensorRT includes. 
Multiple paths may be specified by separating them with a semicolon.") @@ -273,10 +273,10 @@ target_link_libraries( CUDA::cudart ) -if(${TRITON_ENABLE_CIG}) +if(${TRITON_ENABLE_CUDA_CTX_SHARING}) target_compile_definitions( triton-tensorrt-backend - PRIVATE TRITON_ENABLE_CIG + PRIVATE TRITON_ENABLE_CUDA_CTX_SHARING ) target_link_libraries( triton-tensorrt-backend diff --git a/src/instance_state.cc b/src/instance_state.cc index 9a735bc..06b38b8 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,11 +257,7 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!model_state_->isCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } for (auto& io_binding_infos : io_binding_infos_) { @@ -430,11 +426,7 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!model_state_->isCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } #ifdef TRITON_ENABLE_STATS @@ -1563,11 +1555,7 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* ModelInstanceState::InitStreamsAndEvents() { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!model_state_->isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!model_state_->isCudaContextSharingEnabled()) { // Set the device before preparing the context. 
auto cuerr = cudaSetDevice(DeviceId()); if (cuerr != cudaSuccess) { diff --git a/src/model_state.cc b/src/model_state.cc index a2ab925..5912725 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,11 +175,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { cudaSetDevice(device_engine.first.first); } auto& runtime = device_engine.second.first; @@ -215,11 +211,7 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. if (eit->second.second == nullptr) { -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -332,11 +324,7 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { cuerr = cudaSetDevice(device_id); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -390,11 +378,7 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); -#ifdef TRITON_ENABLE_CIG - // Set device if CiG is disabled - if (!isCiGEnabled()) -#endif // TRITON_ENABLE_CIG - { + if (!isCudaContextSharingEnabled()) { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( diff --git a/src/tensorrt.cc b/src/tensorrt.cc index d98a466..cdc29b3 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,9 +318,7 @@ 
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } -#ifdef TRITON_ENABLE_CIG - ScopedRuntimeCiGContext cig_scope(model_state); -#endif // TRITON_ENABLE_CIG + ScopedRuntimeCudaContext cig_scope(model_state); // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance. @@ -359,9 +357,8 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) if (!instance_state) { return nullptr; } -#ifdef TRITON_ENABLE_CIG - ScopedRuntimeCiGContext cig_scope(instance_state->StateForModel()); -#endif // TRITON_ENABLE_CIG + + ScopedRuntimeCudaContext cig_scope(instance_state->StateForModel()); delete instance_state; @@ -386,9 +383,7 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); -#ifdef TRITON_ENABLE_CIG - ScopedRuntimeCiGContext cig_scope(model_state); -#endif // TRITON_ENABLE_CIG + ScopedRuntimeCudaContext cig_scope(model_state); // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 9339dcb..05b7995 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -26,8 +26,6 @@ #include "tensorrt_model.h" -#include - namespace triton { namespace backend { namespace tensorrt { TensorRTModel::Priority @@ -56,10 +54,6 @@ TensorRTModel::TensorRTModel(TRITONBACKEND_Model* triton_model) use_cuda_graphs_(false), gather_kernel_buffer_threshold_(0), separate_output_stream_(false), eager_batching_(false), busy_wait_events_(false) -#ifdef TRITON_ENABLE_CIG - , - cig_ctx_(nullptr) -#endif // TRITON_ENABLE_CIG { ParseModelConfig(); } @@ -96,22 +90,19 @@ TensorRTModel::ParseModelConfig() } } -#ifdef TRITON_ENABLE_CIG +// TODO Ashish +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING 
triton::common::TritonJson::Value parameters; if (model_config_.Find("parameters", ¶meters)) { triton::common::TritonJson::Value value; std::string ptr_value; - if (parameters.Find("CIG_CONTEXT_PTR", &value)) { + if (parameters.Find("CUDA_CONTEXT_PTR", &value)) { RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value)); - std::stringstream ss; - ss << ptr_value; - void* ctx_ptr; - ss >> ctx_ptr; - cig_ctx_ = static_cast(ctx_ptr); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "CiG Context pointer is set"); + cuda_ctx = static_cast(StringToPointer(ptr_value)); + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set"); } } -#endif // TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; // Success } diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 1e40e61..2de730e 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -25,9 +25,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#ifdef TRITON_ENABLE_CIG +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING #include -#endif // TRITON_ENABLE_CIG +#endif // TRITON_ENABLE_CUDA_CTX_SHARING +#include #include "triton/backend/backend_model.h" @@ -57,40 +58,60 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } -#ifdef TRITON_ENABLE_CIG - //! Following functions are related to CiG (Cuda in Graphics) context sharing + void* StringToPointer(std::string& str) + { + std::stringstream ss; + ss << str; + + void* ctx_ptr; + ss >> ctx_ptr; + return ctx_ptr; + } + + //! Following functions are related to Cuda (Cuda in Graphics) context sharing //! for gaming use case. Creating a shared contexts reduces context switching //! overhead and leads to better performance of model execution along side //! Graphics workload. 
- CUcontext GetCiGContext() { return cig_ctx_; } - bool isCiGEnabled() { return cig_ctx_ != nullptr; } - inline TRITONSERVER_Error* PushCiGContext() + bool isCudaContextSharingEnabled() + { +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + return cuda_ctx != nullptr; +#else + return false; +#endif // TRITON_ENABLE_CUDA_CTX_SHARING + } + + inline TRITONSERVER_Error* PushCudaContext() { - if (CUDA_SUCCESS != cuCtxPushCurrent(cig_ctx_)) { +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + if (CUDA_SUCCESS != cuCtxPushCurrent(cuda_ctx)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to push CiG context for ") + Name()).c_str()); + (std::string("unable to push Cuda context for ") + Name()).c_str()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; } - inline TRITONSERVER_Error* PopCiGContext() + inline TRITONSERVER_Error* PopCudaContext() { +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING CUcontext oldCtx{}; if (CUDA_SUCCESS != cuCtxPopCurrent(&oldCtx)) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("unable to [pop CiG context for ") + Name()).c_str()); + (std::string("unable to pop Cuda context for ") + Name()).c_str()); } - if (oldCtx != cig_ctx_) { + if (oldCtx != cuda_ctx) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, - (std::string("popping the wrong CiG context for ") + Name()).c_str()); + (std::string("popping the wrong Cuda context for ") + Name()) + .c_str()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; } -#endif // TRITON_ENABLE_CIG protected: common::TritonJson::Value graph_specs_; @@ -100,28 +121,30 @@ class TensorRTModel : public BackendModel { bool separate_output_stream_; bool eager_batching_; bool busy_wait_events_; -#ifdef TRITON_ENABLE_CIG - CUcontext cig_ctx_; -#endif // TRITON_ENABLE_CIG +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + CUcontext cuda_ctx = nullptr; +#endif // TRITON_ENABLE_CUDA_CTX_SHARING }; -#ifdef TRITON_ENABLE_CIG -struct ScopedRuntimeCiGContext { - 
ScopedRuntimeCiGContext(TensorRTModel* model_state) +struct ScopedRuntimeCudaContext { + ScopedRuntimeCudaContext(TensorRTModel* model_state) : model_state_(model_state) { - if (model_state_->isCiGEnabled()) { - THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCiGContext()); +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + if (model_state_->isCudaContextSharingEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING } - ~ScopedRuntimeCiGContext() + ~ScopedRuntimeCudaContext() { - if (model_state_->isCiGEnabled()) { - THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCiGContext()); +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + if (model_state_->isCudaContextSharingEnabled()) { + THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext()); } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING } TensorRTModel* model_state_; }; -#endif // TRITON_ENABLE_CIG }}} // namespace triton::backend::tensorrt From b42a8d6538a6629bacb937f59c7e86345cf74cba Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 30 Oct 2024 03:47:23 -0700 Subject: [PATCH 09/14] remove todo --- src/tensorrt_model.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 05b7995..42fdc39 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -90,7 +90,6 @@ TensorRTModel::ParseModelConfig() } } -// TODO Ashish #ifdef TRITON_ENABLE_CUDA_CTX_SHARING triton::common::TritonJson::Value parameters; if (model_config_.Find("parameters", ¶meters)) { From d9aff26b2c0f1094106c60986b0433f980c77307 Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Wed, 30 Oct 2024 04:25:10 -0700 Subject: [PATCH 10/14] Add GetParameter to fetch string params --- src/tensorrt_model.cc | 39 +++++++++++++++++++++++++++++---------- src/tensorrt_model.h | 12 ++++++++++++ 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index 42fdc39..bd419d5 100644 --- a/src/tensorrt_model.cc +++ 
b/src/tensorrt_model.cc @@ -91,16 +91,11 @@ TensorRTModel::ParseModelConfig() } #ifdef TRITON_ENABLE_CUDA_CTX_SHARING - triton::common::TritonJson::Value parameters; - if (model_config_.Find("parameters", ¶meters)) { - triton::common::TritonJson::Value value; - std::string ptr_value; - if (parameters.Find("CUDA_CONTEXT_PTR", &value)) { - RETURN_IF_ERROR(value.MemberAsString("string_value", &ptr_value)); - cuda_ctx = static_cast(StringToPointer(ptr_value)); - LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set"); - } - } + std::string ptr_str = ""; + RETURN_IF_ERROR(GetParameter("CUDA_CONTEXT_PTR", ptr_str)); + cuda_ctx = static_cast(StringToPointer(ptr_str)); + // cuda_ctx = static_cast(reinterpret_cast(ptr_str)); + LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, "Cuda Context pointer is set"); #endif // TRITON_ENABLE_CUDA_CTX_SHARING return nullptr; // Success @@ -133,4 +128,28 @@ TensorRTModel::GetCudaStreamPriority() return cuda_stream_priority; } +template <> +TRITONSERVER_Error* +TensorRTModel::GetParameter( + std::string const& name, std::string& str_value) +{ + triton::common::TritonJson::Value parameters; + TRITONSERVER_Error* err = + model_config_.MemberAsObject("parameters", ¶meters); + if (err != nullptr) { + return err; + // throw std::runtime_error("Model config doesn't have a parameters + // section"); + } + triton::common::TritonJson::Value value; + err = parameters.MemberAsObject(name.c_str(), &value); + if (err != nullptr) { + return err; + // std::string errStr = "Cannot find parameter with name: " + name; + // throw std::runtime_error(errStr); + } + value.MemberAsString("string_value", &str_value); + return nullptr; +} + }}} // namespace triton::backend::tensorrt diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 2de730e..7e5fe92 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -39,6 +39,14 @@ class TensorRTModel : public BackendModel { TensorRTModel(TRITONBACKEND_Model* triton_model); virtual ~TensorRTModel() = 
default; + template + TRITONSERVER_Error* GetParameter(std::string const& name, T& value) + { + assert(false); + auto dummy = T(); + return dummy; + } + TRITONSERVER_Error* SetTensorRTModelConfig(); TRITONSERVER_Error* ParseModelConfig(); @@ -58,6 +66,10 @@ class TensorRTModel : public BackendModel { bool EagerBatching() { return eager_batching_; } bool BusyWaitEvents() { return busy_wait_events_; } + template <> + TRITONSERVER_Error* GetParameter( + std::string const& name, std::string& str_value); + void* StringToPointer(std::string& str) { std::stringstream ss; From 2ca19acf3a4cf00d167b785b8b39f6a68ef0229d Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Mon, 6 Jan 2025 00:37:44 -0800 Subject: [PATCH 11/14] CiG->Cuda + comment updates --- src/tensorrt.cc | 6 +++--- src/tensorrt_model.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/tensorrt.cc b/src/tensorrt.cc index cdc29b3..747b867 100644 --- a/src/tensorrt.cc +++ b/src/tensorrt.cc @@ -318,7 +318,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) DeviceMemoryTracker::TrackThreadMemoryUsage(lusage.get()); } - ScopedRuntimeCudaContext cig_scope(model_state); + ScopedRuntimeCudaContext cuda_scope(model_state); // With each instance we create a ModelInstanceState object and // associate it with the TRITONBACKEND_ModelInstance.
@@ -358,7 +358,7 @@ TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance) return nullptr; } - ScopedRuntimeCudaContext cig_scope(instance_state->StateForModel()); + ScopedRuntimeCudaContext cuda_scope(instance_state->StateForModel()); delete instance_state; @@ -383,7 +383,7 @@ TRITONBACKEND_ModelInstanceExecute( instance, reinterpret_cast(&instance_state))); ModelState* model_state = instance_state->StateForModel(); - ScopedRuntimeCudaContext cig_scope(model_state); + ScopedRuntimeCudaContext cuda_scope(model_state); // For TensorRT backend, the executing instance may not closely tie to // TRITONBACKEND_ModelInstance, the instance will be assigned based on diff --git a/src/tensorrt_model.h b/src/tensorrt_model.h index 7e5fe92..413e808 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -80,10 +80,10 @@ class TensorRTModel : public BackendModel { return ctx_ptr; } - //! Following functions are related to Cuda (Cuda in Graphics) context sharing - //! for gaming use case. Creating a shared contexts reduces context switching - //! overhead and leads to better performance of model execution along side - //! Graphics workload. + //! Following functions are related to custom Cuda context (Cuda in Graphics) + //! sharing for gaming use case. Creating a shared contexts reduces context + //! switching overhead and leads to better performance of model execution + //! along side Graphics workload. 
bool isCudaContextSharingEnabled() { From 20283c6d76eec9d6035ea71e4dd51d8f78a29c9d Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Mon, 6 Jan 2025 00:38:44 -0800 Subject: [PATCH 12/14] Use RETURN_IF_ERROR macro --- src/tensorrt_model.cc | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/tensorrt_model.cc b/src/tensorrt_model.cc index bd419d5..b466bfa 100644 --- a/src/tensorrt_model.cc +++ b/src/tensorrt_model.cc @@ -134,20 +134,11 @@ TensorRTModel::GetParameter( std::string const& name, std::string& str_value) { triton::common::TritonJson::Value parameters; - TRITONSERVER_Error* err = - model_config_.MemberAsObject("parameters", ¶meters); - if (err != nullptr) { - return err; - // throw std::runtime_error("Model config doesn't have a parameters - // section"); - } + RETURN_IF_ERROR(model_config_.MemberAsObject("parameters", ¶meters)); + triton::common::TritonJson::Value value; - err = parameters.MemberAsObject(name.c_str(), &value); - if (err != nullptr) { - return err; - // std::string errStr = "Cannot find parameter with name: " + name; - // throw std::runtime_error(errStr); - } + RETURN_IF_ERROR(parameters.MemberAsObject(name.c_str(), &value)); + value.MemberAsString("string_value", &str_value); return nullptr; } From e2a9336d9ceaf3034bed0c73cd86af2b80115baf Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Mon, 27 Jan 2025 08:07:03 -0800 Subject: [PATCH 13/14] Handle Multi-GPU failure case --- src/model_state.cc | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/model_state.cc b/src/model_state.cc index 5912725..b337c63 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -324,15 +324,25 @@ ModelState::AutoCompleteConfig() " to auto-complete config for " + Name()) .c_str())); - if (!isCudaContextSharingEnabled()) { - cuerr = cudaSetDevice(device_id); - if (cuerr != cudaSuccess) { - return TRITONSERVER_ErrorNew( - TRITONSERVER_ERROR_INTERNAL, - (std::string("unable 
to set CUDA device to GPU ") + - std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) - .c_str()); - } +#ifdef TRITON_ENABLE_CUDA_CTX_SHARING + // Return failure if Cuda context sharing is enabled and + // if it is a multi GPU setup + if (isCudaContextSharingEnabled() && device_id != 0) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string( + "Cuda context sharing is not supported on mult-GPU system.")) + .c_str()); + } +#endif // TRITON_ENABLE_CUDA_CTX_SHARING + + cuerr = cudaSetDevice(device_id); + if (cuerr != cudaSuccess) { + return TRITONSERVER_ErrorNew( + TRITONSERVER_ERROR_INTERNAL, + (std::string("unable to set CUDA device to GPU ") + + std::to_string(device_id) + " : " + cudaGetErrorString(cuerr)) + .c_str()); } std::string artifact_name; From 625d19e05740eb44e5359828d85a88a8129d11be Mon Sep 17 00:00:00 2001 From: Ashish Karale Date: Tue, 28 Jan 2025 02:37:43 -0800 Subject: [PATCH 14/14] typo + styling fixes --- src/instance_state.cc | 6 +++--- src/model_state.cc | 10 +++++----- src/tensorrt_model.h | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/instance_state.cc b/src/instance_state.cc index 06b38b8..1e0517b 100644 --- a/src/instance_state.cc +++ b/src/instance_state.cc @@ -257,7 +257,7 @@ ModelInstanceState::ModelInstanceState( ModelInstanceState::~ModelInstanceState() { - if (!model_state_->isCudaContextSharingEnabled()) { + if (!model_state_->IsCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } for (auto& io_binding_infos : io_binding_infos_) { @@ -426,7 +426,7 @@ ModelInstanceState::Run( payload_.reset(new Payload(next_set_, requests, request_count)); SET_TIMESTAMP(payload_->compute_start_ns_); - if (!model_state_->isCudaContextSharingEnabled()) { + if (!model_state_->IsCudaContextSharingEnabled()) { cudaSetDevice(DeviceId()); } #ifdef TRITON_ENABLE_STATS @@ -1555,7 +1555,7 @@ ModelInstanceState::EvaluateTensorRTContext( TRITONSERVER_Error* 
ModelInstanceState::InitStreamsAndEvents() { - if (!model_state_->isCudaContextSharingEnabled()) { + if (!model_state_->IsCudaContextSharingEnabled()) { // Set the device before preparing the context. auto cuerr = cudaSetDevice(DeviceId()); if (cuerr != cudaSuccess) { diff --git a/src/model_state.cc b/src/model_state.cc index b337c63..9d547c1 100644 --- a/src/model_state.cc +++ b/src/model_state.cc @@ -175,7 +175,7 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model) ModelState::~ModelState() { for (auto& device_engine : device_engines_) { - if (!isCudaContextSharingEnabled()) { + if (!IsCudaContextSharingEnabled()) { cudaSetDevice(device_engine.first.first); } auto& runtime = device_engine.second.first; @@ -211,7 +211,7 @@ ModelState::CreateEngine( // We share the engine (for models that don't have dynamic shapes) and // runtime across instances that have access to the same GPU/NVDLA. if (eit->second.second == nullptr) { - if (!isCudaContextSharingEnabled()) { + if (!IsCudaContextSharingEnabled()) { auto cuerr = cudaSetDevice(gpu_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( @@ -327,11 +327,11 @@ ModelState::AutoCompleteConfig() #ifdef TRITON_ENABLE_CUDA_CTX_SHARING // Return failure if Cuda context sharing is enabled and // if it is a multi GPU setup - if (isCudaContextSharingEnabled() && device_id != 0) { + if (IsCudaContextSharingEnabled() && device_id != 0) { return TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INTERNAL, (std::string( - "Cuda context sharing is not supported on mult-GPU system.")) + "Cuda context sharing is not supported on multi-GPU system.")) .c_str()); } #endif // TRITON_ENABLE_CUDA_CTX_SHARING @@ -388,7 +388,7 @@ ModelState::AutoCompleteConfig() RETURN_IF_ERROR(AutoCompleteConfigHelper(model_path)); - if (!isCudaContextSharingEnabled()) { + if (!IsCudaContextSharingEnabled()) { cuerr = cudaSetDevice(current_device); if (cuerr != cudaSuccess) { return TRITONSERVER_ErrorNew( diff --git a/src/tensorrt_model.h 
b/src/tensorrt_model.h index 413e808..bbc2329 100644 --- a/src/tensorrt_model.h +++ b/src/tensorrt_model.h @@ -85,7 +85,7 @@ class TensorRTModel : public BackendModel { //! switching overhead and leads to better performance of model execution //! along side Graphics workload. - bool isCudaContextSharingEnabled() + bool IsCudaContextSharingEnabled() { #ifdef TRITON_ENABLE_CUDA_CTX_SHARING return cuda_ctx != nullptr; @@ -143,7 +143,7 @@ struct ScopedRuntimeCudaContext { : model_state_(model_state) { #ifdef TRITON_ENABLE_CUDA_CTX_SHARING - if (model_state_->isCudaContextSharingEnabled()) { + if (model_state_->IsCudaContextSharingEnabled()) { THROW_IF_BACKEND_MODEL_ERROR(model_state_->PushCudaContext()); } #endif // TRITON_ENABLE_CUDA_CTX_SHARING @@ -151,7 +151,7 @@ struct ScopedRuntimeCudaContext { ~ScopedRuntimeCudaContext() { #ifdef TRITON_ENABLE_CUDA_CTX_SHARING - if (model_state_->isCudaContextSharingEnabled()) { + if (model_state_->IsCudaContextSharingEnabled()) { THROW_IF_BACKEND_MODEL_ERROR(model_state_->PopCudaContext()); } #endif // TRITON_ENABLE_CUDA_CTX_SHARING