From 7a7e4abc4dd48284df652dad770a2622788e958c Mon Sep 17 00:00:00 2001 From: Bryce Dubayah Date: Tue, 27 Aug 2024 15:00:25 -0700 Subject: [PATCH] BT-11899 check for kv cache reuse (#1099) --- pyproject.toml | 2 +- truss/constants.py | 2 +- truss/templates/trtllm-briton/src/engine.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d7037539..bcf3b08cd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "truss" -version = "0.9.30rc4" +version = "0.9.30rc5" description = "A seamless bridge from model development to model delivery" license = "MIT" readme = "README.md" diff --git a/truss/constants.py b/truss/constants.py index 1bc897035..3ec6d830b 100644 --- a/truss/constants.py +++ b/truss/constants.py @@ -109,7 +109,7 @@ "grpcio==1.64.0", "grpcio-tools==1.64.0", "transformers==4.43.2", - "truss==0.9.30rc1", + "truss==0.9.30rc3", "outlines==0.0.46", "torch==2.4.0", ] diff --git a/truss/templates/trtllm-briton/src/engine.py b/truss/templates/trtllm-briton/src/engine.py index 26570d7fd..014ab7b26 100644 --- a/truss/templates/trtllm-briton/src/engine.py +++ b/truss/templates/trtllm-briton/src/engine.py @@ -90,6 +90,9 @@ def __init__(self, **kwargs): self._kv_cache_free_gpu_mem_fraction = ( truss_trtllm_build_config.kv_cache_free_gpu_mem_fraction ) + self._enable_kv_cache_reuse = ( + truss_trtllm_build_config.plugin_configuration.use_paged_context_fmha + ) self._hf_token = None try: @@ -115,6 +118,7 @@ def load(self): engine_path: "{self._data_dir.resolve()}" hf_tokenizer: "{self._tokenizer_repository}" kv_cache_free_gpu_mem_fraction: {self._kv_cache_free_gpu_mem_fraction} + enable_kv_cache_reuse: {"true" if self._enable_kv_cache_reuse else "false"} fsm_cache_dir: "{FSM_CACHE_DIR}" """ config_pbtxt_path = (self._data_dir / "briton_config.pbtxt").resolve()