From c8966c6e025afd8a222bc19e1ef4dd2310434805 Mon Sep 17 00:00:00 2001
From: Chris Steege
Date: Thu, 9 Mar 2023 18:23:52 -0600
Subject: [PATCH 1/4] Changed if statement to check for no indexable documents
 and print message if verbose is True. Also made changes to query.py and
 document.py.

Co-authored-by: MindFlow
---
 mindflow/core/index.py          |   5 +-
 mindflow/core/query.py          | 100 +++++++++++++++-----------
 mindflow/db/objects/document.py |   2 +-
 3 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/mindflow/core/index.py b/mindflow/core/index.py
index 7bae2f3..2e314f1 100644
--- a/mindflow/core/index.py
+++ b/mindflow/core/index.py
@@ -49,8 +49,9 @@ def run_index(document_paths: List[str], refresh: bool, verbose: bool = True) ->
     indexable_document_references: List[DocumentReference] = return_if_indexable(
         document_references, refresh
     )
-    if not indexable_document_references and verbose:
-        print("No documents to index")
+    if not indexable_document_references:
+        if verbose:
+            print("No documents to index")
         return
 
     print_total_size(indexable_document_references)
diff --git a/mindflow/core/query.py b/mindflow/core/query.py
index 928acaf..e64272b 100644
--- a/mindflow/core/query.py
+++ b/mindflow/core/query.py
@@ -69,30 +69,29 @@ def select_content(
     """
     This function is used to generate a prompt based on a question or summarization task
     """
-    embedding_ranked_document_chunks: List[
+    ranked_document_chunks: List[
         DocumentChunk
     ] = rank_document_chunks_by_embedding(query, resolved, embedding_model)
-    if len(embedding_ranked_document_chunks) == 0:
+    if len(ranked_document_chunks) == 0:
         print(
             "No index for requested hashes. Please generate index for passed content."
         )
         sys.exit(1)
 
     selected_content = trim_content(
-        embedding_ranked_document_chunks, completion_model, query
+        ranked_document_chunks, completion_model, query
     )
 
     return selected_content
 
-
 class DocumentChunk:
     """
     This class is used to store the chunks of a document.
""" def __init__( - self, path: str, start: int, end: int, embedding: Optional[np.ndarray] = None - ): + self, path: str, start: int, end: int, embedding: np.ndarray = None + ): self.path = path self.start = start self.end = end @@ -109,21 +108,9 @@ def from_search_tree( """ stack = [document.search_tree] - chunks: List["DocumentChunk"] = [ - cls( - document.path, - document.search_tree["start"], - document.search_tree["end"], - ) - ] - embedding_response: Union[ModelError, np.ndarray] = embedding_model( - document.search_tree["summary"] - ) - if isinstance(embedding_response, ModelError): - print(embedding_response.embedding_message) - return [], [] + chunks: List["DocumentChunk"] = [] + embeddings: List[np.ndarray] = [] - embeddings: List[np.ndarray] = [embedding_response] rolling_summary: List[str] = [] while stack: node = stack.pop() @@ -131,17 +118,18 @@ def from_search_tree( if node["leaves"]: for leaf in node["leaves"]: stack.append(leaf) - chunks.append(cls(document.path, leaf["start"], leaf["end"])) - rolling_summary_embedding_response: Union[ - np.ndarray, ModelError - ] = embedding_model( - "\n\n".join(rolling_summary) + "\n\n" + leaf["summary"], - ) - if isinstance(rolling_summary_embedding_response, ModelError): - print(rolling_summary_embedding_response.embedding_message) - continue - embeddings.append(rolling_summary_embedding_response) - rolling_summary.pop() + else: + rolling_summary_embedding_response: Union[ + np.ndarray, ModelError + ] = embedding_model("\n\n".join(rolling_summary)) + if isinstance(rolling_summary_embedding_response, ModelError): + print(rolling_summary_embedding_response.embedding_message) + continue + + chunks.append(cls(document.path, node["start"], node["end"])) + embeddings.append(rolling_summary_embedding_response) + + rolling_summary.pop() return chunks, embeddings @@ -155,27 +143,34 @@ def trim_content( selected_content: str = "" for document_chunk in ranked_document_chunks: - if document_chunk: - with open(document_chunk.path, "r", encoding="utf-8") as file: - file.seek(document_chunk.start) - text = file.read(document_chunk.end - document_chunk.start) - - # Perform a binary search to find the maximum amount of text that fits within the token limit - left, right = 0, len(text) - while left <= right: - mid = (left + right) // 2 - if ( - get_token_count(model, query + selected_content + text[:mid]) - <= model.hard_token_limit - MinimumReservedLength.QUERY.value - ): - left = mid + 1 - else: - right = mid - 1 - - # Add the selected text to the selected content - selected_content += text[:right] + with open(document_chunk.path, "r", encoding="utf-8") as file: + file.seek(document_chunk.start) + text = file.read(document_chunk.end - document_chunk.start) + + selected_content += formated_chunk(document_chunk, text) + + if get_token_count(model, query + selected_content) > model.hard_token_limit: + break + + # Perform a binary search to trim the selected content to fit within the token limit + left, right = 0, len(selected_content) + while left <= right: + mid = (left + right) // 2 + if ( + get_token_count(model, query + selected_content[:mid]) + <= model.hard_token_limit - MinimumReservedLength.QUERY.value + ): + left = mid + 1 + else: + right = mid - 1 + + # Trim the selected content to the new bounds + selected_content = selected_content[:right] + return selected_content +def formated_chunk(document_chunk: DocumentChunk, text: str) -> str: + return "Path: " + document_chunk.path + " Start: " + str(document_chunk.start) + " End: " + 
str(document_chunk.end) + " Text: " + text + "\n\n" def rank_document_chunks_by_embedding( query: str, @@ -205,9 +200,10 @@ def rank_document_chunks_by_embedding( for document in filtered_documents ] for future in as_completed(futures): - document_chunks, document_chunk_embeddings = future.result() + # Ordered together + document_chunks, embeddings = future.result() similarities = cosine_similarity( - prompt_embeddings, document_chunk_embeddings + prompt_embeddings, embeddings )[0] ranked_document_chunks.extend(list(zip(document_chunks, similarities))) diff --git a/mindflow/db/objects/document.py b/mindflow/db/objects/document.py index 602e693..f9877ed 100644 --- a/mindflow/db/objects/document.py +++ b/mindflow/db/objects/document.py @@ -86,7 +86,7 @@ def from_resolved( document_reference.id, document_reference.document_type ) if not document_text: - print(f"Unable to read document text: {document_reference.id}") + ## print(f"Unable to read document text: {document_reference.id}") continue document_text_bytes = document_text.encode("utf-8") From d67790cae82de989d60f56fd732a64b778bcd174 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Thu, 9 Mar 2023 18:24:20 -0600 Subject: [PATCH 2/4] Updated version number to 0.3.14. Co-authored-by: MindFlow --- mindflow/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mindflow/__init__.py b/mindflow/__init__.py index 8a3be2e..dc1bba8 100644 --- a/mindflow/__init__.py +++ b/mindflow/__init__.py @@ -1 +1 @@ -__version__ = "0.3.13" +__version__ = "0.3.14" From 87cae2017d71fc9144d9c0f3da723699963d7a48 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Thu, 9 Mar 2023 18:25:27 -0600 Subject: [PATCH 3/4] Refactor code formatting to conform to PEP8 standards and improve readability. Also, rename functions and add type hints. Co-authored-by: MindFlow --- mindflow/core/query.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/mindflow/core/query.py b/mindflow/core/query.py index e64272b..952abe4 100644 --- a/mindflow/core/query.py +++ b/mindflow/core/query.py @@ -69,29 +69,26 @@ def select_content( """ This function is used to generate a prompt based on a question or summarization task """ - ranked_document_chunks: List[ - DocumentChunk - ] = rank_document_chunks_by_embedding(query, resolved, embedding_model) + ranked_document_chunks: List[DocumentChunk] = rank_document_chunks_by_embedding( + query, resolved, embedding_model + ) if len(ranked_document_chunks) == 0: print( "No index for requested hashes. Please generate index for passed content." ) sys.exit(1) - selected_content = trim_content( - ranked_document_chunks, completion_model, query - ) + selected_content = trim_content(ranked_document_chunks, completion_model, query) return selected_content + class DocumentChunk: """ This class is used to store the chunks of a document. 
""" - def __init__( - self, path: str, start: int, end: int, embedding: np.ndarray = None - ): + def __init__(self, path: str, start: int, end: int, embedding: np.ndarray = None): self.path = path self.start = start self.end = end @@ -149,9 +146,12 @@ def trim_content( selected_content += formated_chunk(document_chunk, text) - if get_token_count(model, query + selected_content) > model.hard_token_limit: + if ( + get_token_count(model, query + selected_content) + > model.hard_token_limit + ): break - + # Perform a binary search to trim the selected content to fit within the token limit left, right = 0, len(selected_content) while left <= right: @@ -169,8 +169,20 @@ def trim_content( return selected_content + def formated_chunk(document_chunk: DocumentChunk, text: str) -> str: - return "Path: " + document_chunk.path + " Start: " + str(document_chunk.start) + " End: " + str(document_chunk.end) + " Text: " + text + "\n\n" + return ( + "Path: " + + document_chunk.path + + " Start: " + + str(document_chunk.start) + + " End: " + + str(document_chunk.end) + + " Text: " + + text + + "\n\n" + ) + def rank_document_chunks_by_embedding( query: str, @@ -202,9 +214,7 @@ def rank_document_chunks_by_embedding( for future in as_completed(futures): # Ordered together document_chunks, embeddings = future.result() - similarities = cosine_similarity( - prompt_embeddings, embeddings - )[0] + similarities = cosine_similarity(prompt_embeddings, embeddings)[0] ranked_document_chunks.extend(list(zip(document_chunks, similarities))) ranked_document_chunks.sort(key=lambda x: x[1], reverse=True) From 5b6e4c8da672de8b2706f85c9ac5ab7188510772 Mon Sep 17 00:00:00 2001 From: Chris Steege Date: Thu, 9 Mar 2023 18:28:43 -0600 Subject: [PATCH 4/4] Remove `embedding` parameter from `DocumentChunk` class `__init__` method. Co-authored-by: MindFlow --- mindflow/core/query.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mindflow/core/query.py b/mindflow/core/query.py index 952abe4..df84b27 100644 --- a/mindflow/core/query.py +++ b/mindflow/core/query.py @@ -88,11 +88,10 @@ class DocumentChunk: This class is used to store the chunks of a document. """ - def __init__(self, path: str, start: int, end: int, embedding: np.ndarray = None): + def __init__(self, path: str, start: int, end: int): self.path = path self.start = start self.end = end - self.embedding = embedding @classmethod def from_search_tree(