Merge remote-tracking branch 'upstream/main' into add_clickhouse_vectore_store

# Conflicts:
#	poetry.lock
#	pyproject.toml
jaluma committed Jul 8, 2024
2 parents 1775cf6 + b687dc8 commit 48f8aaf
Showing 13 changed files with 2,258 additions and 2,169 deletions.
8 changes: 4 additions & 4 deletions fern/docs/pages/api-reference/sdks.mdx
@@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la

<Cards>
<Card
title="Node.js/TypeScript - WIP"
title="TypeScript"
icon="fa-brands fa-node"
href="https://github.com/imartinez/privateGPT-typescript"
href="https://github.com/zylon-ai/privategpt-ts"
/>
<Card
title="Python - Ready!"
title="Python"
icon="fa-brands fa-python"
href="https://github.com/imartinez/pgpt_python"
href="https://github.com/zylon-ai/pgpt-python"
/>
<br />
</Cards>
4,238 changes: 2,141 additions & 2,097 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 11 additions & 2 deletions private_gpt/components/embedding/embedding_component.py
@@ -55,8 +55,17 @@ def __init__(self, settings: Settings) -> None:
"OpenAI dependencies not found, install with `poetry install --extras embeddings-openai`"
) from e

openai_settings = settings.openai.api_key
self.embedding_model = OpenAIEmbedding(api_key=openai_settings)
api_base = (
settings.openai.embedding_api_base or settings.openai.api_base
)
api_key = settings.openai.embedding_api_key or settings.openai.api_key
model = settings.openai.embedding_model

self.embedding_model = OpenAIEmbedding(
api_base=api_base,
api_key=api_key,
model=model,
)
case "ollama":
try:
from llama_index.embeddings.ollama import ( # type: ignore
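Note on the hunk above: the OpenAI embedding client now takes its base URL, key and model from new embedding-specific settings, falling back to the shared `openai` values when those are unset. A minimal sketch of that resolution, using a stand-in object rather than the real private_gpt `Settings` (field names are taken from the settings diff further down):

```python
# Sketch only: mirrors the fallback logic added above; `openai` is a stand-in,
# not the real private_gpt settings object.
from types import SimpleNamespace

openai = SimpleNamespace(
    api_base="https://api.openai.com/v1",    # shared OpenAI settings
    api_key="sk-shared-key",
    embedding_api_base=None,                  # new embedding-specific overrides
    embedding_api_key=None,
    embedding_model="text-embedding-ada-002",
)

# Embedding-specific values win; otherwise fall back to the shared ones.
api_base = openai.embedding_api_base or openai.api_base
api_key = openai.embedding_api_key or openai.api_key
model = openai.embedding_model
print(api_base, model)  # https://api.openai.com/v1 text-embedding-ada-002
```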
2 changes: 1 addition & 1 deletion private_gpt/components/llm/custom/sagemaker.py
@@ -218,7 +218,7 @@ def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:

response_body = resp["Body"]
response_str = response_body.read().decode("utf-8")
response_dict = eval(response_str)
response_dict = json.loads(response_str)

return CompletionResponse(
text=response_dict[0]["generated_text"][len(prompt) :], raw=resp
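The one-line change above swaps `eval` for `json.loads` when decoding the SageMaker response body, so the endpoint output is parsed as JSON data instead of being executed as Python. A minimal sketch of the parsing path (the payload is illustrative, and it assumes `json` is already imported in `sagemaker.py`):

```python
import json

# Illustrative body in the shape a SageMaker text-generation endpoint returns:
# a JSON list with one object holding "generated_text" (prompt + completion).
response_str = '[{"generated_text": "Tell me a joke. Why did the chicken..."}]'

# json.loads only parses data; eval() would have executed arbitrary expressions.
response_dict = json.loads(response_str)
prompt = "Tell me a joke."
print(response_dict[0]["generated_text"][len(prompt):])  # completion only
```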
14 changes: 11 additions & 3 deletions private_gpt/components/llm/llm_component.py
@@ -51,7 +51,7 @@ def __init__(self, settings: Settings) -> None:
"Local dependencies not found, install with `poetry install --extras llms-llama-cpp`"
) from e

prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
prompt_style = get_prompt_style(settings.llm.prompt_style)
settings_kwargs = {
"tfs_z": settings.llamacpp.tfs_z, # ollama and llama-cpp
"top_k": settings.llamacpp.top_k, # ollama and llama-cpp
@@ -109,15 +109,23 @@ def __init__(self, settings: Settings) -> None:
raise ImportError(
"OpenAILike dependencies not found, install with `poetry install --extras llms-openai-like`"
) from e

prompt_style = get_prompt_style(settings.llm.prompt_style)
openai_settings = settings.openai
self.llm = OpenAILike(
api_base=openai_settings.api_base,
api_key=openai_settings.api_key,
model=openai_settings.model,
is_chat_model=True,
max_tokens=None,
max_tokens=settings.llm.max_new_tokens,
api_version="",
temperature=settings.llm.temperature,
context_window=settings.llm.context_window,
max_new_tokens=settings.llm.max_new_tokens,
messages_to_prompt=prompt_style.messages_to_prompt,
completion_to_prompt=prompt_style.completion_to_prompt,
tokenizer=settings.llm.tokenizer,
timeout=openai_settings.request_timeout,
reuse_client=False,
)
case "ollama":
try:
26 changes: 15 additions & 11 deletions private_gpt/components/llm/prompt_helper.py
@@ -173,18 +173,22 @@ def _completion_to_prompt(self, completion: str) -> str:

class MistralPromptStyle(AbstractPromptStyle):
def _messages_to_prompt(self, messages: Sequence[ChatMessage]) -> str:
prompt = "<s>"
inst_buffer = []
text = ""
for message in messages:
role = message.role
content = message.content or ""
if role.lower() == "system":
message_from_user = f"[INST] {content.strip()} [/INST]"
prompt += message_from_user
elif role.lower() == "user":
prompt += "</s>"
message_from_user = f"[INST] {content.strip()} [/INST]"
prompt += message_from_user
return prompt
if message.role == MessageRole.SYSTEM or message.role == MessageRole.USER:
inst_buffer.append(str(message.content).strip())
elif message.role == MessageRole.ASSISTANT:
text += "<s>[INST] " + "\n".join(inst_buffer) + " [/INST]"
text += " " + str(message.content).strip() + "</s>"
inst_buffer.clear()
else:
raise ValueError(f"Unknown message role {message.role}")

if len(inst_buffer) > 0:
text += "<s>[INST] " + "\n".join(inst_buffer) + " [/INST]"

return text

def _completion_to_prompt(self, completion: str) -> str:
return self._messages_to_prompt(
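For context on the rewrite above: consecutive system and user messages are now buffered into a single `[INST]` block, an assistant message closes the `<s>...</s>` pair, and any trailing buffered turns open one final block. A minimal usage sketch; the import paths are assumptions based on the repository layout and llama_index conventions:

```python
from llama_index.core.llms import ChatMessage, MessageRole

from private_gpt.components.llm.prompt_helper import MistralPromptStyle

style = MistralPromptStyle()
messages = [
    ChatMessage(content="A", role=MessageRole.SYSTEM),
    ChatMessage(content="B", role=MessageRole.USER),
    ChatMessage(content="C", role=MessageRole.ASSISTANT),
    ChatMessage(content="D", role=MessageRole.USER),
]
# System + user turns merge into one [INST] block; the assistant reply closes
# it with </s>; the trailing user turn opens a new block.
print(style.messages_to_prompt(messages))
# -> <s>[INST] A\nB [/INST] C</s><s>[INST] D [/INST]
```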
10 changes: 5 additions & 5 deletions private_gpt/components/vector_store/vector_store_component.py
@@ -4,10 +4,10 @@
from injector import inject, singleton
from llama_index.core.indices.vector_store import VectorIndexRetriever, VectorStoreIndex
from llama_index.core.vector_stores.types import (
BasePydanticVectorStore,
FilterCondition,
MetadataFilter,
MetadataFilters,
VectorStore,
)

from private_gpt.open_ai.extensions.context_filter import ContextFilter
@@ -32,7 +32,7 @@ def _doc_id_metadata_filter(
@singleton
class VectorStoreComponent:
settings: Settings
vector_store: VectorStore
vector_store: BasePydanticVectorStore

@inject
def __init__(self, settings: Settings) -> None:
@@ -54,7 +54,7 @@ def __init__(self, settings: Settings) -> None:
)

self.vector_store = typing.cast(
VectorStore,
BasePydanticVectorStore,
PGVectorStore.from_params(
**settings.postgres.model_dump(exclude_none=True),
table_name="embeddings",
@@ -87,7 +87,7 @@ def __init__(self, settings: Settings) -> None:
) # TODO

self.vector_store = typing.cast(
VectorStore,
BasePydanticVectorStore,
BatchedChromaVectorStore(
chroma_client=chroma_client, chroma_collection=chroma_collection
),
@@ -115,7 +115,7 @@ def __init__(self, settings: Settings) -> None:
**settings.qdrant.model_dump(exclude_none=True)
)
self.vector_store = typing.cast(
VectorStore,
BasePydanticVectorStore,
QdrantVectorStore(
client=client,
collection_name="make_this_parameterizable_per_api_call",
36 changes: 24 additions & 12 deletions private_gpt/settings/settings.py
@@ -104,6 +104,17 @@ class LLMSettings(BaseModel):
0.1,
description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
)
prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
"llama2",
description=(
"The prompt style to use for the chat engine. "
"If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
"If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
"If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
"If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]"
"`llama2` is the historic behaviour. `default` might work better with your custom models."
),
)


class VectorstoreSettings(BaseModel):
@@ -117,18 +128,6 @@ class NodeStoreSettings(BaseModel):
class LlamaCPPSettings(BaseModel):
llm_hf_repo_id: str
llm_hf_model_file: str
prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
"llama2",
description=(
"The prompt style to use for the chat engine. "
"If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
"If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
"If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
"If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]"
"`llama2` is the historic behaviour. `default` might work better with your custom models."
),
)

tfs_z: float = Field(
1.0,
description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
@@ -206,6 +205,19 @@ class OpenAISettings(BaseModel):
"gpt-3.5-turbo",
description="OpenAI Model to use. Example: 'gpt-4'.",
)
request_timeout: float = Field(
120.0,
description="Time elapsed until openailike server times out the request. Default is 120s. Format is float. ",
)
embedding_api_base: str = Field(
None,
description="Base URL of OpenAI API. Example: 'https://api.openai.com/v1'.",
)
embedding_api_key: str
embedding_model: str = Field(
"text-embedding-ada-002",
description="OpenAI embedding Model to use. Example: 'text-embedding-3-large'.",
)


class OllamaSettings(BaseModel):
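One practical consequence of the settings change above: `prompt_style` now lives on `LLMSettings` rather than `LlamaCPPSettings`, so configuration files set it under `llm:` instead of `llamacpp:` (the YAML hunks below make the same move). A simplified sketch of the relocated field, using a stand-in model rather than the real settings classes:

```python
# Sketch only: stand-in for the relocated field; the real definition is in
# private_gpt/settings/settings.py as shown in the diff above.
from typing import Literal

from pydantic import BaseModel, Field


class LLMSettingsStub(BaseModel):
    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
        "llama2", description="Prompt style used by the chat engine."
    )


print(LLMSettingsStub().prompt_style)                        # llama2 (default)
print(LLMSettingsStub(prompt_style="mistral").prompt_style)  # mistral
```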
47 changes: 25 additions & 22 deletions pyproject.toml
@@ -7,49 +7,52 @@ authors = ["Zylon <hi@zylon.ai>"]
[tool.poetry.dependencies]
python = ">=3.11,<3.12"
# PrivateGPT
fastapi = { extras = ["all"], version = "^0.110.0" }
fastapi = { extras = ["all"], version = "^0.111.0" }
python-multipart = "^0.0.9"
injector = "^0.21.0"
pyyaml = "^6.0.1"
watchdog = "^4.0.0"
transformers = "^4.38.2"
watchdog = "^4.0.1"
transformers = "^4.42.3"
docx2txt = "^0.8"
cryptography = "^3.1"
# LlamaIndex core libs
llama-index-core = "^0.10.14"
llama-index-readers-file = "^0.1.6"
llama-index-core = "^0.10.52"
llama-index-readers-file = "^0.1.27"
# Optional LlamaIndex integration libs
llama-index-llms-llama-cpp = {version = "^0.1.3", optional = true}
llama-index-llms-openai = {version = "^0.1.6", optional = true}
llama-index-llms-llama-cpp = {version = "^0.1.4", optional = true}
llama-index-llms-openai = {version = "^0.1.25", optional = true}
llama-index-llms-openai-like = {version ="^0.1.3", optional = true}
llama-index-llms-ollama = {version ="^0.1.2", optional = true}
llama-index-llms-azure-openai = {version ="^0.1.5", optional = true}
llama-index-llms-ollama = {version ="^0.1.5", optional = true}
llama-index-llms-azure-openai = {version ="^0.1.8", optional = true}
llama-index-embeddings-ollama = {version ="^0.1.2", optional = true}
llama-index-embeddings-huggingface = {version ="^0.1.4", optional = true}
llama-index-embeddings-openai = {version ="^0.1.6", optional = true}
llama-index-embeddings-azure-openai = {version ="^0.1.6", optional = true}
llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true}
llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true}
llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true}
llama-index-embeddings-huggingface = {version ="^0.2.2", optional = true}
llama-index-embeddings-openai = {version ="^0.1.10", optional = true}
llama-index-embeddings-azure-openai = {version ="^0.1.10", optional = true}
llama-index-vector-stores-qdrant = {version ="^0.2.10", optional = true}
llama-index-vector-stores-chroma = {version ="^0.1.10", optional = true}
llama-index-vector-stores-postgres = {version ="^0.1.11", optional = true}
llama-index-vector-stores-clickhouse = {version ="^0.1.3", optional = true}
llama-index-storage-docstore-postgres = {version ="^0.1.2", optional = true}
llama-index-storage-index-store-postgres = {version ="^0.1.2", optional = true}
llama-index-storage-docstore-postgres = {version ="^0.1.3", optional = true}
llama-index-storage-index-store-postgres = {version ="^0.1.4", optional = true}
# Postgres
psycopg2-binary = {version ="^2.9.9", optional = true}
asyncpg = {version="^0.29.0", optional = true}

# ClickHouse
clickhouse-connect = {version = "^0.7.8", optional = true}
clickhouse-connect = {version = "^0.7.15", optional = true}

# Optional Sagemaker dependency
boto3 = {version ="^1.34.51", optional = true}
boto3 = {version ="^1.34.139", optional = true}

# Optional Qdrant client
qdrant-client = {version ="^1.9.0", optional = true}

# Optional Reranker dependencies
torch = {version ="^2.1.2", optional = true}
sentence-transformers = {version ="^2.6.1", optional = true}
torch = {version ="^2.3.1", optional = true}
sentence-transformers = {version ="^3.0.1", optional = true}

# Optional UI
gradio = {version ="^4.19.2", optional = true}
gradio = {version ="^4.37.2", optional = true}

[tool.poetry.extras]
ui = ["gradio"]
4 changes: 2 additions & 2 deletions settings-local.yaml
@@ -8,9 +8,9 @@ llm:
max_new_tokens: 512
context_window: 3900
tokenizer: mistralai/Mistral-7B-Instruct-v0.2
prompt_style: "mistral"

llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf

@@ -24,4 +24,4 @@ vectorstore:
database: qdrant

qdrant:
path: local_data/private_gpt/qdrant
path: local_data/private_gpt/qdrant
4 changes: 4 additions & 0 deletions settings-vllm.yaml
@@ -3,6 +3,9 @@ server:

llm:
mode: openailike
max_new_tokens: 512
tokenizer: mistralai/Mistral-7B-Instruct-v0.2
temperature: 0.1

embedding:
mode: huggingface
@@ -15,3 +18,4 @@ openai:
api_base: http://localhost:8000/v1
api_key: EMPTY
model: facebook/opt-125m
request_timeout: 600.0
5 changes: 3 additions & 2 deletions settings.yaml
@@ -5,7 +5,7 @@ server:
env_name: ${APP_ENV:prod}
port: ${PORT:8001}
cors:
enabled: false
enabled: true
allow_origins: ["*"]
allow_methods: ["*"]
allow_headers: ["*"]
@@ -36,6 +36,7 @@ ui:

llm:
mode: llamacpp
prompt_style: "mistral"
# Should be matching the selected model
max_new_tokens: 512
context_window: 3900
@@ -60,7 +61,6 @@ clickhouse:
database: embeddings

llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
@@ -102,6 +102,7 @@ sagemaker:
openai:
api_key: ${OPENAI_API_KEY:}
model: gpt-3.5-turbo
embedding_api_key: ${OPENAI_API_KEY:}

ollama:
llm_model: llama2
20 changes: 12 additions & 8 deletions tests/test_prompt_helper.py
@@ -69,17 +69,21 @@ def test_tag_prompt_style_format_with_system_prompt():
def test_mistral_prompt_style_format():
prompt_style = MistralPromptStyle()
messages = [
ChatMessage(content="You are an AI assistant.", role=MessageRole.SYSTEM),
ChatMessage(content="Hello, how are you doing?", role=MessageRole.USER),
ChatMessage(content="A", role=MessageRole.SYSTEM),
ChatMessage(content="B", role=MessageRole.USER),
]

expected_prompt = (
"<s>[INST] You are an AI assistant. [/INST]</s>"
"[INST] Hello, how are you doing? [/INST]"
)

expected_prompt = "<s>[INST] A\nB [/INST]"
assert prompt_style.messages_to_prompt(messages) == expected_prompt

messages2 = [
ChatMessage(content="A", role=MessageRole.SYSTEM),
ChatMessage(content="B", role=MessageRole.USER),
ChatMessage(content="C", role=MessageRole.ASSISTANT),
ChatMessage(content="D", role=MessageRole.USER),
]
expected_prompt2 = "<s>[INST] A\nB [/INST] C</s><s>[INST] D [/INST]"
assert prompt_style.messages_to_prompt(messages2) == expected_prompt2


def test_chatml_prompt_style_format():
prompt_style = ChatMLPromptStyle()