fix: update import download_loader #106

Closed · wants to merge 56 commits

Commits
32329fa
Support hybrid vector retrieval
trducng May 11, 2024
3e2f98c
Enable figures and table reading in Azure DI
trducng May 12, 2024
d876ed2
Retrieve with multi-modal
trducng May 12, 2024
b9ceb00
Fix mixing up table
trducng May 13, 2024
63879f2
Add txt loader
trducng May 14, 2024
322a94b
Add Anthropic Chat
trducng May 16, 2024
c55572c
Raising error when retrieving help file
trducng May 20, 2024
5b2773c
Allow same filename for different people if private is True
trducng May 20, 2024
3e80168
Allow declaring extra LLM vendors
trducng May 25, 2024
7d3354f
Show chunks on the File page
trducng May 25, 2024
620b81d
Allow elasticsearch to get more docs
trducng May 28, 2024
ad61c71
Fix Cohere response (#86)
doviettung96 May 31, 2024
0708fb5
Add confidence score (#87)
doviettung96 Jun 3, 2024
684f1c5
Upgrade the confidence score appearance (#90)
doviettung96 Jun 6, 2024
ce1f9d6
Remove columns and rows in Excel loader which contains all NaN (#91)
linhnguyen-cinnamon Jun 7, 2024
3b8ce6a
Track retriever state
trducng Jun 7, 2024
f279447
Bump llama-index version 0.10
trducng Jun 8, 2024
baabcbc
feat/save-azuredi-mhtml-to-markdown (#93)
jacky0218 Jun 12, 2024
e5e85ea
fix: losing first chunk (#94)
jacky0218 Jun 12, 2024
fd1e5ac
fix: adding the base64 image in markdown (#95)
jacky0218 Jun 13, 2024
7dd26f5
feat: more chunk info on UI
trducng Jun 13, 2024
bf61f1b
fix: error when reindexing files
trducng Jun 13, 2024
7618738
refactor: allow more information exception trace when using gpt4v
trducng Jun 16, 2024
8ac48f6
feat: add excel reader that treats each worksheet as a document
trducng Jun 16, 2024
91ebdca
Persist loader information when indexing file
trducng Jun 18, 2024
868a0d8
feat: allow hiding unneeded setting panels
trducng Jun 18, 2024
c541507
feat: allow specific timezone when creating conversation
trducng Jun 18, 2024
428bbe2
feat: add more confidence score (#96)
doviettung96 Jun 18, 2024
1e26358
feat: replace text length with token in file list
trducng Jun 18, 2024
8cd8758
ui: show index name instead of id in the settings
trducng Jun 18, 2024
11bc6d6
feat(ai): restrict the vision temperature
trducng Jun 18, 2024
aea24de
fix(ui): remove the misleading message about non-retrieved evidences
trducng Jun 18, 2024
aa18272
feat(ui): show the reasoning name and description in the reasoning se…
trducng Jun 18, 2024
55ac272
feat(ui): show version on the main windows
trducng Jun 18, 2024
3f5fd6f
feat(ui): show default llm name in the setting page
trducng Jun 18, 2024
ad26dff
fix(conf): append the result of doc in llm_scoring (#97)
doviettung96 Jun 21, 2024
54a4e9c
fix: constrain maximum number of images
trducng Jun 22, 2024
ff0f852
feat(ui): allow filter file by name in file list page
trducng Jun 22, 2024
ceebbb2
Fix exceeding token length error for OpenAI embeddings by chunking th…
trducng Jul 3, 2024
d04dc2f
fix: Allow empty string when calling embedding
trducng Jul 4, 2024
9e2fe4a
fix: update trulens LLM ranking score for retrieval confidence, impro…
doviettung96 Jul 7, 2024
fd54bbd
feat: add question decomposition & few-shot rewrite pipeline (#89)
doviettung96 Jul 8, 2024
fccb8f6
fix: add encoding utf-8 when save temporal markdown in vectorIndex (#…
phv2312 Jul 8, 2024
b4cee6f
fix: improve retrieval pipeline and relevant score display (#102)
taprosoft Jul 8, 2024
16a7e4f
feat: improve UI default settings and add quick switch option for pip…
taprosoft Jul 8, 2024
f0a7576
fix: improve agent logics (#103)
taprosoft Jul 8, 2024
813f871
fix: less verbose debug log
taprosoft Jul 8, 2024
87f14a3
feat: add warning message for low confidence
taprosoft Jul 8, 2024
e98b797
fix: LLM scoring enabled by default
taprosoft Jul 8, 2024
af2a542
fix: minor update logics
trducng Jul 9, 2024
775476c
fix: hotfix image citation
trducng Jul 9, 2024
7e25728
feat: update docx loader for handle merged table cells + handle zip f…
phv2312 Jul 10, 2024
7a1bdeb
fix: escape text in download UI
taprosoft Jul 10, 2024
88d02c3
feat: optimize vector store query db (#105)
phv2312 Jul 12, 2024
9e215c4
fix: add openai embedding exponential back-off
trducng Jul 16, 2024
774e359
fix: update import download_loader
phv2312 Jul 17, 2024
Files changed
12 changes: 6 additions & 6 deletions docs/pages/app/index/file.md
@@ -107,9 +107,9 @@ string rather than a string.

## Software infrastructure

Before (removed):

| Infra | Access | Schema | Ref |
| --- | --- | --- | --- |
| SQL table Source | self.\_Source | - id (int): id of the source (auto)<br>- name (str): the name of the file<br>- path (str): the path of the file<br>- size (int): the file size in bytes<br>- text_length (int): the number of characters in the file (default 0)<br>- date_created (datetime): the time the file is created (auto) | This is a SQLAlchemy ORM class |
| SQL table Index | self.\_Index | - id (int): id of the index entry (auto)<br>- source_id (int): the id of a file in the Source table<br>- target_id: the id of the segment in the docstore or vector store<br>- relation_type (str): whether the link is "document" or "vector" | This is a SQLAlchemy ORM class |
| Vector store | self.\_VS | - self.\_VS.add: add a list of embeddings to the vector store (optionally with associated metadata and ids)<br>- self.\_VS.delete: delete vector entries by id<br>- self.\_VS.query: retrieve stored entries by embedding similarity | kotaemon > storages > vectorstores > BaseVectorStore |
| Doc store | self.\_DS | - self.\_DS.add: add segments to the document store<br>- self.\_DS.get: get segments by id<br>- self.\_DS.get_all: get all segments<br>- self.\_DS.delete: delete segments by id | kotaemon > storages > docstores > base > BaseDocumentStore |

After (added), identical except for the Source row, where text_length is replaced by a note column:

| SQL table Source | self.\_Source | - id (int): id of the source (auto)<br>- name (str): the name of the file<br>- path (str): the path of the file<br>- size (int): the file size in bytes<br>- note (dict): extra optional information about the file<br>- date_created (datetime): the time the file is created (auto) | This is a SQLAlchemy ORM class |
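Taken together, indexing a file touches all four stores: one Source row for the file, docstore and vector store entries for each of its segments, and Index rows linking them back. A minimal sketch of that flow (method signatures are simplified assumptions based on the table above; ORM session handling is omitted):

```python
# Hypothetical indexing flow across the four stores described above.
def index_file(self, file_path, segments, embeddings):
    # 1. Register the file in the Source table (id and date_created are auto).
    source = self._Source(
        name=file_path.name,
        path=str(file_path),
        size=file_path.stat().st_size,
    )

    # 2. Persist the segments and their embeddings.
    self._DS.add(segments)
    self._VS.add(embeddings, ids=[seg.doc_id for seg in segments])

    # 3. Link each segment back to its source file.
    for seg in segments:
        self._Index(source_id=source.id, target_id=seg.doc_id, relation_type="document")
        self._Index(source_id=source.id, target_id=seg.doc_id, relation_type="vector")
```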
10 changes: 10 additions & 0 deletions libs/kotaemon/kotaemon/agents/rewoo/agent.py
@@ -317,6 +317,14 @@ def stream(self, instruction: str, use_citation: bool = False):
)

print("Planner output:", planner_text_output)
# output planner to info panel
yield AgentOutput(
text="",
agent_type=self.agent_type,
status="thinking",
intermediate_steps=[{"planner_log": planner_text_output}],
)

# Work
worker_evidences, plugin_cost, plugin_token = self._get_worker_evidence(
planner_evidences, evidence_level
@@ -326,7 +334,9 @@
worker_log += f"{plan}: {plans[plan]}\n"
current_progress = f"{plan}: {plans[plan]}\n"
for e in plan_to_es[plan]:
worker_log += f"#Action: {planner_evidences.get(e, None)}\n"
worker_log += f"{e}: {worker_evidences[e]}\n"
current_progress += f"#Action: {planner_evidences.get(e, None)}\n"
current_progress += f"{e}: {worker_evidences[e]}\n"

yield AgentOutput(
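The new yield surfaces the planner's raw output to the UI as an intermediate "thinking" step before any worker evidence is produced. A minimal sketch of how a caller might consume the stream (the `agent` variable and prompt are hypothetical; the field names come from the yield above):

```python
# Sketch: print the planner log from the streamed AgentOutput objects.
for output in agent.stream("Compare document A with document B"):
    if output.status == "thinking":
        for step in output.intermediate_steps or []:
            if "planner_log" in step:
                print("[planner]", step["planner_log"])
    elif output.text:
        print(output.text)  # regular answer chunks
```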
2 changes: 1 addition & 1 deletion libs/kotaemon/kotaemon/agents/tools/google.py
@@ -1,7 +1,7 @@
from typing import AnyStr, Optional, Type
from urllib.error import HTTPError

from langchain.utilities import SerpAPIWrapper
from langchain_community.utilities import SerpAPIWrapper
from pydantic import BaseModel, Field

from .base import BaseTool
8 changes: 6 additions & 2 deletions libs/kotaemon/kotaemon/agents/tools/llm.py
@@ -22,12 +22,16 @@ class LLMTool(BaseTool):
)
llm: BaseLLM
args_schema: Optional[Type[BaseModel]] = LLMArgs
dummy_mode: bool = True

def _run_tool(self, query: AnyStr) -> str:
output = None
try:
response = self.llm(query)
if not self.dummy_mode:
response = self.llm(query)
else:
response = None
except ValueError:
raise ToolException("LLM Tool call failed")
output = response.text
output = response.text if response else "<->"
return output
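As committed, `dummy_mode` defaults to `True`, so the tool skips the LLM call and returns the `"<->"` placeholder unless the flag is disabled. A usage sketch (assuming the standard `BaseTool.run` entry point; `my_llm` is a hypothetical `BaseLLM` instance):

```python
# Sketch: LLMTool answers with a placeholder in dummy mode,
# and with a real completion when dummy_mode=False.
tool = LLMTool(llm=my_llm, dummy_mode=False)
print(tool.run("Summarize the findings in one sentence."))
# With the default dummy_mode=True, _run_tool returns "<->" instead.
```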
5 changes: 3 additions & 2 deletions libs/kotaemon/kotaemon/base/schema.py
@@ -5,8 +5,8 @@
from langchain.schema.messages import AIMessage as LCAIMessage
from langchain.schema.messages import HumanMessage as LCHumanMessage
from langchain.schema.messages import SystemMessage as LCSystemMessage
from llama_index.bridge.pydantic import Field
from llama_index.schema import Document as BaseDocument
from llama_index.core.bridge.pydantic import Field
from llama_index.core.schema import Document as BaseDocument

if TYPE_CHECKING:
from haystack.schema import Document as HaystackDocument
@@ -140,6 +140,7 @@ class LLMInterface(AIMessage):
total_cost: float = 0
logits: list[list[float]] = Field(default_factory=list)
messages: list[AIMessage] = Field(default_factory=list)
logprobs: list[float] = []


class ExtractorOutput(Document):
4 changes: 1 addition & 3 deletions libs/kotaemon/kotaemon/contribs/promptui/ui/chat.py
@@ -133,9 +133,7 @@ def construct_chat_ui(
label="Output file", show_label=True, height=100
)
export_btn = gr.Button("Export")
export_btn.click(
func_export_to_excel, inputs=None, outputs=exported_file
)
export_btn.click(func_export_to_excel, inputs=[], outputs=exported_file)

with gr.Row():
with gr.Column():
4 changes: 2 additions & 2 deletions libs/kotaemon/kotaemon/contribs/promptui/ui/pipeline.py
@@ -91,7 +91,7 @@ def construct_pipeline_ui(
save_btn.click(func_save, inputs=params, outputs=history_dataframe)
load_params_btn = gr.Button("Reload params")
load_params_btn.click(
func_load_params, inputs=None, outputs=history_dataframe
func_load_params, inputs=[], outputs=history_dataframe
)
history_dataframe.render()
history_dataframe.select(
@@ -103,7 +103,7 @@
export_btn = gr.Button(
"Export (Result will be in Exported file next to Output)"
)
export_btn.click(func_export, inputs=None, outputs=exported_file)
export_btn.click(func_export, inputs=[], outputs=exported_file)
with gr.Row():
with gr.Column():
if params:
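Both promptui changes replace `inputs=None` with `inputs=[]` in Gradio click handlers, presumably because newer Gradio releases expect a list of input components. A self-contained sketch of the pattern (component names are illustrative, not taken from the PR):

```python
import gradio as gr

def export_to_file():
    # Write a dummy export and return its path for the File component.
    path = "/tmp/export.txt"
    with open(path, "w") as f:
        f.write("exported")
    return path

with gr.Blocks() as demo:
    exported_file = gr.File(label="Output file")
    export_btn = gr.Button("Export")
    # An event handler with no inputs takes an explicit empty list.
    export_btn.click(export_to_file, inputs=[], outputs=exported_file)
```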
71 changes: 62 additions & 9 deletions libs/kotaemon/kotaemon/embeddings/openai.py
@@ -1,12 +1,34 @@
from itertools import islice
from typing import Optional

import numpy as np
import tiktoken
from tenacity import retry, stop_after_attempt, wait_random_exponential
from theflow.utils.modules import import_dotted_string

from kotaemon.base import Param

from .base import BaseEmbeddings, Document, DocumentWithEmbedding


def split_text_by_chunk_size(text: str, chunk_size: int) -> list[list[int]]:
"""Split the text into chunks of a given size

Args:
text: text to split
chunk_size: size of each chunk

Returns:
list of chunks (as tokens)
"""
encoding = tiktoken.get_encoding("cl100k_base")
tokens = iter(encoding.encode(text))
result = []
while chunk := list(islice(tokens, chunk_size)):
result.append(chunk)
return result
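# Illustrative check (not part of the diff): with chunk_size=8191, a
# 20,000-token input yields chunks of 8191, 8191, and 3618 tokens.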


class BaseOpenAIEmbeddings(BaseEmbeddings):
"""Base interface for OpenAI embedding model, using the openai library.

@@ -32,6 +54,9 @@ class BaseOpenAIEmbeddings(BaseEmbeddings):
"Only supported in `text-embedding-3` and later models."
),
)
context_length: Optional[int] = Param(
8191, help="The maximum context length of the embedding model"
)

@Param.auto(depends_on=["max_retries"])
def max_retries_(self):
@@ -56,16 +81,42 @@ def openai_response(self, client, **kwargs):
def invoke(
self, text: str | list[str] | Document | list[Document], *args, **kwargs
) -> list[DocumentWithEmbedding]:
input_ = self.prepare_input(text)
input_doc = self.prepare_input(text)
client = self.prepare_client(async_version=False)
resp = self.openai_response(
client, input=[_.text if _.text else " " for _ in input_], **kwargs
).dict()
output_ = sorted(resp["data"], key=lambda x: x["index"])
return [
DocumentWithEmbedding(embedding=o["embedding"], content=i)
for i, o in zip(input_, output_)
]

input_: list[str | list[int]] = []
splitted_indices = {}
for idx, text in enumerate(input_doc):
if self.context_length:
chunks = split_text_by_chunk_size(text.text or " ", self.context_length)
splitted_indices[idx] = (len(input_), len(input_) + len(chunks))
input_.extend(chunks)
else:
splitted_indices[idx] = (len(input_), len(input_) + 1)
input_.append(text.text)

resp = self.openai_response(client, input=input_, **kwargs).dict()
output_ = list(sorted(resp["data"], key=lambda x: x["index"]))

output = []
for idx, doc in enumerate(input_doc):
embs = output_[splitted_indices[idx][0] : splitted_indices[idx][1]]
if len(embs) == 1:
output.append(
DocumentWithEmbedding(embedding=embs[0]["embedding"], content=doc)
)
continue

chunk_lens = [
len(_)
for _ in input_[splitted_indices[idx][0] : splitted_indices[idx][1]]
]
vs: list[list[float]] = [_["embedding"] for _ in embs]
emb = np.average(vs, axis=0, weights=chunk_lens)
emb = emb / np.linalg.norm(emb)
output.append(DocumentWithEmbedding(embedding=emb.tolist(), content=doc))

return output

async def ainvoke(
self, text: str | list[str] | Document | list[Document], *args, **kwargs
@@ -118,6 +169,7 @@ def prepare_client(self, async_version: bool = False):

return OpenAI(**params)

@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(6))
def openai_response(self, client, **kwargs):
"""Get the openai response"""
params: dict = {
@@ -174,6 +226,7 @@ def prepare_client(self, async_version: bool = False):

return AzureOpenAI(**params)

@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(6))
def openai_response(self, client, **kwargs):
"""Get the openai response"""
params: dict = {
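Two things happen in this file: `openai_response` gains a tenacity retry with exponential back-off, and `invoke` now splits over-long inputs into context-sized token chunks, embeds each chunk, and combines the results with a length-weighted average re-normalized to unit length. A standalone sketch of that combination strategy (`embed_chunk` stands in for the real embedding call and is an assumption):

```python
from itertools import islice

import numpy as np
import tiktoken

def embed_long_text(text: str, embed_chunk, context_length: int = 8191) -> list[float]:
    # Tokenize and split into context-sized chunks, as split_text_by_chunk_size does.
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = iter(encoding.encode(text))
    chunks = []
    while chunk := list(islice(tokens, context_length)):
        chunks.append(chunk)

    # Embed each chunk, then average weighted by token count so longer
    # chunks contribute proportionally more.
    vectors = [embed_chunk(c) for c in chunks]
    weights = [len(c) for c in chunks]
    avg = np.average(vectors, axis=0, weights=weights)

    # A weighted average of unit vectors is shorter than 1, so re-normalize.
    return (avg / np.linalg.norm(avg)).tolist()
```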
4 changes: 2 additions & 2 deletions libs/kotaemon/kotaemon/indices/base.py
@@ -3,7 +3,7 @@
from abc import abstractmethod
from typing import Any, Type

from llama_index.node_parser.interface import NodeParser
from llama_index.core.node_parser.interface import NodeParser

from kotaemon.base import BaseComponent, Document, RetrievedDocument

@@ -32,7 +32,7 @@ class LlamaIndexDocTransformerMixin:
Example:
class TokenSplitter(LlamaIndexMixin, BaseSplitter):
def _get_li_class(self):
from llama_index.text_splitter import TokenTextSplitter
from llama_index.core.text_splitter import TokenTextSplitter
return TokenTextSplitter

To use this mixin, please:
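Most of the import churn in this PR follows from the llama-index 0.10 bump earlier in the commit list: core modules moved under the `llama_index.core` namespace and file readers were split into their own package. The `download_loader` fix in the PR title is the same migration; a sketch of the pattern (exact locations are per the llama-index 0.10 migration notes and should be verified against the installed version):

```python
# Before llama-index 0.10:
#   from llama_index import download_loader
#   from llama_index.readers import PDFReader

# From llama-index 0.10 onward:
from llama_index.core import download_loader
from llama_index.core.schema import Document
from llama_index.readers.file import PDFReader  # pip install llama-index-readers-file
```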
4 changes: 2 additions & 2 deletions libs/kotaemon/kotaemon/indices/extractors/doc_parsers.py
@@ -15,7 +15,7 @@ def __init__(
super().__init__(llm=llm, nodes=nodes, **params)

def _get_li_class(self):
from llama_index.extractors import TitleExtractor
from llama_index.core.extractors import TitleExtractor

return TitleExtractor

@@ -30,6 +30,6 @@ def __init__(
super().__init__(llm=llm, summaries=summaries, **params)

def _get_li_class(self):
from llama_index.extractors import SummaryExtractor
from llama_index.core.extractors import SummaryExtractor

return SummaryExtractor
4 changes: 2 additions & 2 deletions libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -1,8 +1,8 @@
from pathlib import Path
from typing import Type

from llama_index.readers import PDFReader
from llama_index.readers.base import BaseReader
from llama_index.core.readers.base import BaseReader
from llama_index.readers.file import PDFReader

from kotaemon.base import BaseComponent, Document, Param
from kotaemon.indices.extractors import BaseDocParser
4 changes: 3 additions & 1 deletion libs/kotaemon/kotaemon/indices/qa/citation.py
@@ -103,7 +103,9 @@ def invoke(self, context: str, question: str):
print("CitationPipeline: invoking LLM")
llm_output = self.get_from_path("llm").invoke(messages, **llm_kwargs)
print("CitationPipeline: finish invoking LLM")
if not llm_output.messages:
if not llm_output.messages or not llm_output.additional_kwargs.get(
"tool_calls"
):
return None
function_output = llm_output.additional_kwargs["tool_calls"][0]["function"][
"arguments"
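The widened guard covers models that answer without invoking the citation function, in which case `additional_kwargs` carries no `tool_calls` entry. The same defensive pattern in isolation (names mirror the diff; the return value is the raw JSON arguments string):

```python
def get_tool_call_arguments(llm_output):
    # Bail out if the model produced no messages or never called the function.
    tool_calls = llm_output.additional_kwargs.get("tool_calls")
    if not llm_output.messages or not tool_calls:
        return None
    return tool_calls[0]["function"]["arguments"]
```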
10 changes: 9 additions & 1 deletion libs/kotaemon/kotaemon/indices/rankings/__init__.py
@@ -1,5 +1,13 @@
from .base import BaseReranking
from .cohere import CohereReranking
from .llm import LLMReranking
from .llm_scoring import LLMScoring
from .llm_trulens import LLMTrulensScoring

__all__ = ["CohereReranking", "LLMReranking", "BaseReranking"]
__all__ = [
"CohereReranking",
"LLMReranking",
"LLMScoring",
"BaseReranking",
"LLMTrulensScoring",
]
14 changes: 7 additions & 7 deletions libs/kotaemon/kotaemon/indices/rankings/cohere.py
@@ -1,6 +1,6 @@
from __future__ import annotations

import os
from decouple import config

from kotaemon.base import Document

@@ -9,8 +9,7 @@

class CohereReranking(BaseReranking):
model_name: str = "rerank-multilingual-v2.0"
cohere_api_key: str = os.environ.get("COHERE_API_KEY", "")
top_k: int = 1
cohere_api_key: str = config("COHERE_API_KEY", "")

def run(self, documents: list[Document], query: str) -> list[Document]:
"""Use Cohere Reranker model to re-order documents
@@ -29,12 +28,13 @@ def run(self, documents: list[Document], query: str) -> list[Document]:
return compressed_docs

_docs = [d.content for d in documents]
results = cohere_client.rerank(
model=self.model_name, query=query, documents=_docs, top_n=self.top_k
response = cohere_client.rerank(
model=self.model_name, query=query, documents=_docs
)
for r in results:
print("Cohere score", [r.relevance_score for r in response.results])
for r in response.results:
doc = documents[r.index]
doc.metadata["relevance_score"] = r.relevance_score
doc.metadata["cohere_reranking_score"] = r.relevance_score
compressed_docs.append(doc)

return compressed_docs
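With `top_n` no longer passed, Cohere scores and returns every document, and the score is stored under the new `cohere_reranking_score` metadata key rather than `relevance_score`. A hypothetical downstream consumer that trims the list back down:

```python
def top_k_by_cohere_score(documents, k: int = 3):
    # Keep the k documents with the highest reranking score.
    return sorted(
        documents,
        key=lambda d: d.metadata.get("cohere_reranking_score", 0.0),
        reverse=True,
    )[:k]
```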
54 changes: 54 additions & 0 deletions libs/kotaemon/kotaemon/indices/rankings/llm_scoring.py
@@ -0,0 +1,54 @@
from __future__ import annotations

from concurrent.futures import ThreadPoolExecutor

import numpy as np
from langchain.output_parsers.boolean import BooleanOutputParser

from kotaemon.base import Document

from .llm import LLMReranking


class LLMScoring(LLMReranking):
def run(
self,
documents: list[Document],
query: str,
) -> list[Document]:
"""Filter down documents based on their relevance to the query."""
filtered_docs: list[Document] = []
output_parser = BooleanOutputParser()

if self.concurrent:
with ThreadPoolExecutor() as executor:
futures = []
for doc in documents:
_prompt = self.prompt_template.populate(
question=query, context=doc.get_content()
)
# bind the prompt at submission time: a bare `lambda: self.llm(_prompt)`
# would capture `_prompt` late and could run every call with the last prompt
futures.append(executor.submit(self.llm, _prompt))

results = [future.result() for future in futures]
else:
results = []
for doc in documents:
_prompt = self.prompt_template.populate(
question=query, context=doc.get_content()
)
results.append(self.llm(_prompt))

for result, doc in zip(results, documents):
score = np.exp(np.average(result.logprobs))
include_doc = output_parser.parse(result.text)
if include_doc:
doc.metadata["llm_reranking_score"] = score
else:
doc.metadata["llm_reranking_score"] = 1 - score
filtered_docs.append(doc)

# prevent returning empty result
if len(filtered_docs) == 0:
filtered_docs = documents[: self.top_k]

return filtered_docs
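The score is `exp(mean(logprobs))`, i.e. the geometric mean of the per-token probabilities of the model's yes/no answer, and the boolean parse decides whether that probability or its complement is recorded. A worked example:

```python
import numpy as np

logprobs = [-0.05, -0.10, -0.02]              # per-token logprobs of the answer
score = float(np.exp(np.average(logprobs)))   # geometric mean of token probs

answer_is_yes = True                          # BooleanOutputParser result
llm_reranking_score = score if answer_is_yes else 1 - score
print(round(llm_reranking_score, 3))          # 0.945
```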