add LLAMA_CPP_N_CTX env variable to control maximum context length
Benjoyo committed Nov 12, 2024
1 parent c1bddde commit 7e4a0f1
Showing 2 changed files with 4 additions and 3 deletions.
4 changes: 2 additions & 2 deletions bpm_ai_inference/llm/llama_cpp/llama_chat.py
@@ -15,7 +15,7 @@
 from bpm_ai_inference.llm.llama_cpp._constants import DEFAULT_MODEL, DEFAULT_TEMPERATURE, DEFAULT_MAX_RETRIES, \
     DEFAULT_QUANT_BALANCED
 from bpm_ai_inference.llm.llama_cpp.util import messages_to_llama_dicts
-from bpm_ai_inference.util import FORCE_OFFLINE_FLAG
+from bpm_ai_inference.util import FORCE_OFFLINE_FLAG, LLAMA_CPP_N_CTX
 from bpm_ai_inference.util.files import find_file
 from bpm_ai_inference.util.hf import hf_home

@@ -47,6 +47,7 @@ def __init__(
         temperature: float = DEFAULT_TEMPERATURE,
         grammar: str = None,
         max_retries: int = DEFAULT_MAX_RETRIES,
+        n_ctx: int = int(os.getenv(LLAMA_CPP_N_CTX, "4096")),
         force_offline: bool = (os.getenv(FORCE_OFFLINE_FLAG, "false").lower() == "true")
     ):
         if not has_llama_cpp_python:
@@ -58,7 +59,6 @@ def __init__(
             retryable_exceptions=[]
         )
         self.grammar = grammar
-        n_ctx = 4096
         if force_offline:
             model_file = find_file(hf_home() + "hub/models--" + model.replace("/", "--"), filename)
         self.llm = Llama(
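
One subtlety worth noting: the new n_ctx default is a default-argument expression, so int(os.getenv(LLAMA_CPP_N_CTX, "4096")) is evaluated once, when llama_chat.py is first imported, not on every call. The environment variable therefore has to be set before the module is imported. A minimal sketch (the class name ChatLlamaCpp is assumed from the module name; it is not visible in this diff):

import os

# Set the variable before importing llama_chat: the default argument
# int(os.getenv(LLAMA_CPP_N_CTX, "4096")) is evaluated at import time.
os.environ["LLAMA_CPP_N_CTX"] = "8192"

from bpm_ai_inference.llm.llama_cpp.llama_chat import ChatLlamaCpp  # assumed class name

llm = ChatLlamaCpp()               # picks up n_ctx=8192 from the environment
small = ChatLlamaCpp(n_ctx=2048)   # an explicit argument still overrides the env default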
3 changes: 2 additions & 1 deletion bpm_ai_inference/util/__init__.py
@@ -1 +1,2 @@
-FORCE_OFFLINE_FLAG = "FORCE_OFFLINE"
\ No newline at end of file
+FORCE_OFFLINE_FLAG = "FORCE_OFFLINE"
+LLAMA_CPP_N_CTX = "LLAMA_CPP_N_CTX"
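
Since the variable name is exported as a constant, callers inside the package can reference it instead of hard-coding the string; a small sketch:

import os
from bpm_ai_inference.util import LLAMA_CPP_N_CTX

os.environ[LLAMA_CPP_N_CTX] = "8192"  # equivalent to os.environ["LLAMA_CPP_N_CTX"] = "8192"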
