
Commit

fix: BaseDocumentChunk output fix
simjak committed Feb 25, 2024
1 parent eeeb234 commit 490e46f
Showing 3 changed files with 38 additions and 54 deletions.
60 changes: 9 additions & 51 deletions dev/walkthrough.ipynb
@@ -35,7 +35,7 @@
},
{
"cell_type": "code",
-    "execution_count": 17,
+    "execution_count": 28,
"metadata": {},
"outputs": [
{
@@ -66,7 +66,7 @@
" },\n",
" \"chunk_config\": {\n",
" \"partition_strategy\": \"auto\", # For tables use \"hi_res\"\n",
-    "  \"split_method\": \"semantic\",\n",
+    "  \"split_method\": \"semantic\", # or 'by_title'\n",
" \"min_chunk_tokens\": 50,\n",
" \"max_token_size\": 300,\n",
" \"rolling_window_size\": 1\n",
@@ -86,7 +86,7 @@
},
{
"cell_type": "code",
-    "execution_count": 18,
+    "execution_count": 29,
"metadata": {},
"outputs": [
{
@@ -103,9 +103,9 @@
" \"content\": \"Results in table 5 show that element-based chunking strategies o\\ufb00er the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its e\\ufb03ciency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the e\\ufb00ectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
-    "    \"chunk_index\": 12,\n",
+    "    \"chunk_index\": null,\n",
" \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
-    "    \"token_count\": 188,\n",
+    "    \"token_count\": null,\n",
" \"page_number\": 9,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
@@ -124,9 +124,9 @@
" \"content\": \"More speci\\ufb01cally on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though di\\ufb00erent approaches are available, an exhaustive evaluation of chunking applied to RAG and speci\\ufb01cally to \\ufb01nancial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of di\\ufb00erent methods when asking questions about di\\ufb00erent aspects of the reports.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
-    "    \"chunk_index\": 3,\n",
+    "    \"chunk_index\": null,\n",
" \"title\": \"2 Related work\",\n",
-    "    \"token_count\": 199,\n",
+    "    \"token_count\": null,\n",
" \"page_number\": 3,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
@@ -139,57 +139,15 @@
" \"dense_embedding\": null\n",
" },\n",
-    "  {\n",
-    "    \"id\": \"90e306b2-5b1b-42ef-9c5c-fec52a073f84\",\n",
-    "    \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
-    "    \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
-    "    \"content\": \"9 10 the lowest paragraph-level scores among all. On the other hand, element-based chunking strategies showed more consistent results. A fascinating discovery is that when various chunking strategies are com- bined, it results in enhanced retrieval scores, achieving superior performance at both the page level (84.4%) and paragraph level (with ROGUE at 0.568% and BLEU at 0.452%). This \ufb01nding addresses an unresolved question: how to improve the accuracy of RAG. The element based method provides the highest scores and it also provides a mechanism to chunk documents without the need to \ufb01ne tune hyper-parameters like the number of tokens in a chunk. This suggests the element based method is more generalizable and can be applied to new types of documents.\",\n",
-    "    \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
-    "    \"source_type\": \".pdf\",\n",
-    "    \"chunk_index\": 12,\n",
-    "    \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
-    "    \"token_count\": 160,\n",
-    "    \"page_number\": 9,\n",
-    "    \"metadata\": {\n",
-    "      \"filename\": \"tmpq96h17zo.pdf\",\n",
-    "      \"filetype\": \"application/pdf\",\n",
-    "      \"languages\": [\n",
-    "        \"eng\"\n",
-    "      ],\n",
-    "      \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
-    "    },\n",
-    "    \"dense_embedding\": null\n",
-    "  },\n",
-    "  {\n",
-    "    \"id\": \"07ea8e9d-e989-4b4a-a1e9-14506b5c82ff\",\n",
-    "    \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
-    "    \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
-    "    \"content\": \"Retrieval Accuracy Secondly, we evaluate the capabilities of each chunking strategy in terms of retrieval accuracy. We use the page numbers in the ground truth to calculate the page-level retrieval accuracy, and we use ROGUE [24] and BLEU [32] scores to evaluate the accuracy of paragraph-level retrieval compared to the ground truth evidence paragraphs. As shown in Table 4, when compared to Unstructured element-based chunk- ing strategies, basic chunking strategies seem to have higher page-level retrieval accuracy but lower paragraph-level accuracy on average. Additionally, basic chunking strategies also lack consistency between page-level and paragraph-level accuracy; higher page-level accuracy doesn\u2019t ensure higher paragraph-level ac- curacy. For example, Base 128 has the second highest page-level accuracy but\",\n",
-    "    \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
-    "    \"source_type\": \".pdf\",\n",
-    "    \"chunk_index\": 12,\n",
-    "    \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
-    "    \"token_count\": 154,\n",
-    "    \"page_number\": 9,\n",
-    "    \"metadata\": {\n",
-    "      \"filename\": \"tmpq96h17zo.pdf\",\n",
-    "      \"filetype\": \"application/pdf\",\n",
-    "      \"languages\": [\n",
-    "        \"eng\"\n",
-    "      ],\n",
-    "      \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
-    "    },\n",
-    "    \"dense_embedding\": null\n",
-    "  },\n",
" {\n",
" \"id\": \"d8eca566-f0ab-4a36-a674-3f0b99bf3807\",\n",
" \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
" \"content\": \"Chunking strategy Base 128 Base 256 Base 512 Keywords Chipper Summary Chipper Pre\\ufb01x & Table Description Chipper Furthermore, we would like to study the impact of RAG con\\ufb01guration and ele- meant type based chunking.\",\n",
" \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
" \"source_type\": \".pdf\",\n",
-    "    \"chunk_index\": 1,\n",
+    "    \"chunk_index\": null,\n",
" \"title\": \"Financial Report Chunking for E\\ufb00ective Retrieval Augmented Generation\",\n",
-    "    \"token_count\": 53,\n",
+    "    \"token_count\": null,\n",
" \"page_number\": 1,\n",
" \"metadata\": {\n",
" \"filename\": \"tmpq96h17zo.pdf\",\n",
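The notebook cell earlier in this diff sends an ingest payload whose `chunk_config` selects the partition strategy and split method. A minimal sketch of assembling that payload in Python — the `chunk_config` keys mirror the cell above, while the surrounding `files` wrapper is a hypothetical stand-in for whatever the ingest endpoint actually expects:

```python
import json

# chunk_config mirrors the walkthrough cell above; the "files" wrapper
# around the document URL is a hypothetical stand-in.
chunk_config = {
    "partition_strategy": "auto",   # for tables use "hi_res"
    "split_method": "semantic",     # or "by_title"
    "min_chunk_tokens": 50,
    "max_token_size": 300,
    "rolling_window_size": 1,
}

payload = {
    "files": [{"url": "https://arxiv.org/pdf/2402.05131.pdf"}],
    "chunk_config": chunk_config,
}

# Serialize for an HTTP request body.
print(json.dumps(payload, indent=2))
```

Tuning note: `min_chunk_tokens` and `max_token_size` bound chunk length, while `rolling_window_size` controls how much context the semantic splitter looks across when deciding boundaries.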
28 changes: 25 additions & 3 deletions models/document.py
@@ -39,11 +39,33 @@ def from_metadata(cls, metadata: dict):
"token_count",
"page_number",
}
-        filtered_metadata = {k: v for k, v in metadata.items() if k not in exclude_keys}
+        # Prepare metadata for the constructor and for embedding into the object
+        constructor_metadata = {
+            k: v for k, v in metadata.items() if k not in exclude_keys
+        }
+        filtered_metadata = {
+            k: v for k, v in metadata.items() if k in exclude_keys and k != "chunk_id"
+        }
+
+        def to_int(value):
+            try:
+                return int(value) if str(value).isdigit() else None
+            except (TypeError, ValueError):
+                return None
+
+        chunk_index = to_int(metadata.get("chunk_index"))
+        token_count = to_int(metadata.get("token_count"))
+
+        # Remove explicitly passed keys from filtered_metadata to avoid duplication
+        for key in ["chunk_index", "token_count"]:
+            filtered_metadata.pop(key, None)

return cls(
id=metadata.get("chunk_id", ""),
-            **metadata,
-            metadata=filtered_metadata,
+            chunk_index=chunk_index,
+            token_count=token_count,
+            **filtered_metadata,  # Pass filtered metadata for constructor
+            metadata=constructor_metadata,  # Pass the rest as part of the metadata
dense_embedding=metadata.get("values"),
)

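The core of this commit is the `to_int` coercion and the metadata split in `from_metadata` above. The following self-contained sketch reproduces that logic outside the class: `to_int` is copied from the diff, while `split_metadata` is a hypothetical helper wrapping the two dict comprehensions, with `exclude_keys` abridged to the keys visible in the hunk. Note that any value that is not a plain digit string coerces to `None`, which is consistent with the `null` values for `chunk_index` and `token_count` in the new notebook output above.

```python
# to_int is copied verbatim from the diff above; split_metadata is a
# hypothetical helper mirroring the two dict comprehensions.
def to_int(value):
    try:
        return int(value) if str(value).isdigit() else None
    except (TypeError, ValueError):
        return None


# Abridged: the real exclude_keys set in models/document.py has more entries.
exclude_keys = {"chunk_index", "token_count", "page_number"}


def split_metadata(metadata: dict):
    """Split raw metadata into constructor kwargs vs. embedded metadata."""
    constructor_metadata = {
        k: v for k, v in metadata.items() if k not in exclude_keys
    }
    filtered_metadata = {
        k: v for k, v in metadata.items() if k in exclude_keys and k != "chunk_id"
    }
    # chunk_index / token_count are passed explicitly, so drop them here.
    for key in ("chunk_index", "token_count"):
        filtered_metadata.pop(key, None)
    return constructor_metadata, filtered_metadata


print(to_int("12"))   # 12
print(to_int(None))   # None
print(to_int(12.5))   # None -- "12.5".isdigit() is False
```

The `12.5` case shows why the notebook output flipped to `null`: only clean integer strings (or ints) survive the `str(value).isdigit()` check; floats and missing values both coerce to `None` instead of raising.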
4 changes: 4 additions & 0 deletions service/embedding.py
@@ -27,6 +27,10 @@
from utils.summarise import completion
from vectordbs import get_vector_service

+# TODO: Add similarity score to the BaseDocumentChunk
+# TODO: Add relevance score to the BaseDocumentChunk
+# TODO: Add created_at date to the BaseDocumentChunk


class EmbeddingService:
def __init__(
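The TODO comments above describe fields not yet on `BaseDocumentChunk`. One hedged way to stage them — sketched here with a stdlib dataclass, since the actual base class definition is not part of this diff — is to give every new field a `None` default so existing chunks keep deserializing unchanged:

```python
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional


@dataclass
class ChunkScores:
    """Hypothetical holder for the fields the TODOs describe; the real
    BaseDocumentChunk is defined elsewhere in models/document.py."""

    similarity_score: Optional[float] = None  # raw vector-store similarity
    relevance_score: Optional[float] = None   # post-rerank relevance
    created_at: datetime = field(
        default_factory=lambda: datetime.now(timezone.utc)
    )


scores = ChunkScores(similarity_score=0.87)
print(scores.similarity_score, scores.relevance_score)
```

Defaulting the scores to `None` rather than `0.0` keeps "not scored yet" distinguishable from "scored zero", which matters once both a similarity and a rerank score exist on the same chunk.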
