chore: testing

superagent-ai · Feb 25, 2024 · eeeb234 · eeeb234
1 parent 542d8db
commit eeeb234
Showing 1 changed file with 71 additions and 31 deletions.
diff --git a/dev/walkthrough.ipynb b/dev/walkthrough.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -35,26 +35,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
-     "ename": "JSONDecodeError",
-     "evalue": "Expecting value: line 1 column 1 (char 0)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mJSONDecodeError\u001b[0m                           Traceback (most recent call last)",
-      "File \u001b[0;32m~/simonas/open-source/super-rag/.venv/lib/python3.11/site-packages/requests/models.py:971\u001b[0m, in \u001b[0;36mResponse.json\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m    970\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 971\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcomplexjson\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    972\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    973\u001b[0m     \u001b[38;5;66;03m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[1;32m    974\u001b[0m     \u001b[38;5;66;03m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n",
-      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/json/__init__.py:346\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m    343\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m    344\u001b[0m         parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m    345\u001b[0m         parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[0;32m--> 346\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    347\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
-      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m    333\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m    334\u001b[0m \u001b[38;5;124;03mcontaining a JSON document).\u001b[39;00m\n\u001b[1;32m    335\u001b[0m \n\u001b[1;32m    336\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraw_decode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43midx\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_w\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mend\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    338\u001b[0m end \u001b[38;5;241m=\u001b[39m _w(s, end)\u001b[38;5;241m.\u001b[39mend()\n",
-      "File \u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m    354\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExpecting value\u001b[39m\u001b[38;5;124m\"\u001b[39m, s, err\u001b[38;5;241m.\u001b[39mvalue) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    356\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj, end\n",
-      "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)",
-      "\nDuring handling of the above exception, another exception occurred:\n",
-      "\u001b[0;31mJSONDecodeError\u001b[0m                           Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[6], line 34\u001b[0m\n\u001b[1;32m      4\u001b[0m payload \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m      5\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfiles\u001b[39m\u001b[38;5;124m\"\u001b[39m: [\n\u001b[1;32m      6\u001b[0m         {\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     29\u001b[0m     },\n\u001b[1;32m     30\u001b[0m }\n\u001b[1;32m     32\u001b[0m response \u001b[38;5;241m=\u001b[39m requests\u001b[38;5;241m.\u001b[39mpost(url, json\u001b[38;5;241m=\u001b[39mpayload)\n\u001b[0;32m---> 34\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjson\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n",
-      "File \u001b[0;32m~/simonas/open-source/super-rag/.venv/lib/python3.11/site-packages/requests/models.py:975\u001b[0m, in \u001b[0;36mResponse.json\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m    971\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m complexjson\u001b[38;5;241m.\u001b[39mloads(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtext, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m    972\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m JSONDecodeError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m    973\u001b[0m     \u001b[38;5;66;03m# Catch JSON-related errors and raise as requests.JSONDecodeError\u001b[39;00m\n\u001b[1;32m    974\u001b[0m     \u001b[38;5;66;03m# This aliases json.JSONDecodeError and simplejson.JSONDecodeError\u001b[39;00m\n\u001b[0;32m--> 975\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m RequestsJSONDecodeError(e\u001b[38;5;241m.\u001b[39mmsg, e\u001b[38;5;241m.\u001b[39mdoc, e\u001b[38;5;241m.\u001b[39mpos)\n",
-      "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'success': True, 'index_name': 'simonas-serverless-1536'}\n"
      ]
     }
    ],
@@ -87,6 +75,7 @@
     "    \"encoder\": {\n",
     "        \"name\": \"text-embedding-ada-002\",\n",
     "        \"provider\": \"openai\",\n",
+    "        \"dimensions\": 1536\n",
     "    },\n",
     "}\n",
     "\n",
@@ -97,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -108,9 +97,9 @@
       "    \"success\": true,\n",
       "    \"data\": [\n",
       "        {\n",
-      "            \"id\": \"79b04347-f56a-4987-8901-fe1415b592b3\",\n",
+      "            \"id\": \"19baf12d-cc24-44a7-bbb8-b3068e2217e3\",\n",
       "            \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
-      "            \"document_id\": \"doc_879d9ef3-e2c1-45ef-ae2f-396065b5f18e\",\n",
+      "            \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
       "            \"content\": \"Results in table 5 show that element-based chunking strategies o\\ufb00er the best question-answering accuracy, which is consistent with page retrieval and para- graph retrieval accuracy. Lastly, our approach stands out for its e\\ufb03ciency. Not only is element-based chunking generalizable without the need to select the chunk size, but when com- pared to the aggregation results that yield the highest retrieval scores. Element- based chunking achieves the highest retrieval scores with only half the number of chunks required compared to methods that do not consider the structure of the documents (62,529 v.s. 112,155). This can reduce the indexing cost and im- prove query latency because there are only half as many vectors to index for the vectordb that stores the chunks. This underscores the e\\ufb00ectiveness of our solu- tion in optimizing the balance between performance and computational resource requirements.\",\n",
       "            \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
       "            \"source_type\": \".pdf\",\n",
@@ -119,7 +108,7 @@
       "            \"token_count\": 188,\n",
       "            \"page_number\": 9,\n",
       "            \"metadata\": {\n",
-      "                \"filename\": \"tmpsaw15pz9.pdf\",\n",
+      "                \"filename\": \"tmpq96h17zo.pdf\",\n",
       "                \"filetype\": \"application/pdf\",\n",
       "                \"languages\": [\n",
       "                    \"eng\"\n",
@@ -129,9 +118,9 @@
       "            \"dense_embedding\": null\n",
       "        },\n",
       "        {\n",
-      "            \"id\": \"1adb618f-2b6e-45cb-8ea8-a622d9498dc7\",\n",
+      "            \"id\": \"20340d3d-f6f0-4dde-98e7-8193c77be46e\",\n",
       "            \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
-      "            \"document_id\": \"doc_7778c117-6100-423b-9c33-55acf42329e6\",\n",
+      "            \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
       "            \"content\": \"More speci\\ufb01cally on document chunking methods for RAG, there are stan- dard approaches being considered such as chunking text into spans of a given token length (e.g. 128 and 256) or chunking based on sentences. Open source projects already allow simple processing of documents (e.g. Unstructured4, Lla- maindex5 or Langchain 6), without explicitly considering the table structure on which these chunking strategies are applied. Even though di\\ufb00erent approaches are available, an exhaustive evaluation of chunking applied to RAG and speci\\ufb01cally to \\ufb01nancial reporting, except for some limited chunking analysis [14,36], is non-existent. In our work, we compare a broad range of chunking approaches in addition to more simple ones and provide an analysis of the outcomes of di\\ufb00erent methods when asking questions about di\\ufb00erent aspects of the reports.\",\n",
       "            \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
       "            \"source_type\": \".pdf\",\n",
@@ -140,7 +129,7 @@
       "            \"token_count\": 199,\n",
       "            \"page_number\": 3,\n",
       "            \"metadata\": {\n",
-      "                \"filename\": \"tmpxr1ag3dr.pdf\",\n",
+      "                \"filename\": \"tmpq96h17zo.pdf\",\n",
       "                \"filetype\": \"application/pdf\",\n",
       "                \"languages\": [\n",
       "                    \"eng\"\n",
@@ -150,9 +139,51 @@
       "            \"dense_embedding\": null\n",
       "        },\n",
       "        {\n",
-      "            \"id\": \"cb1a0fd4-e5ad-4cb4-b5ff-5da531934fab\",\n",
+      "            \"id\": \"90e306b2-5b1b-42ef-9c5c-fec52a073f84\",\n",
+      "            \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
+      "            \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
+      "            \"content\": \"9 10 the lowest paragraph-level scores among all. On the other hand, element-based chunking strategies showed more consistent results. A fascinating discovery is that when various chunking strategies are com- bined, it results in enhanced retrieval scores, achieving superior performance at both the page level (84.4%) and paragraph level (with ROGUE at 0.568% and BLEU at 0.452%). This \\ufb01nding addresses an unresolved question: how to improve the accuracy of RAG. The element based method provides the highest scores and it also provides a mechanism to chunk documents without the need to \\ufb01ne tune hyper-parameters like the number of tokens in a chunk. This suggests the element based method is more generalizable and can be applied to new types of documents.\",\n",
+      "            \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
+      "            \"source_type\": \".pdf\",\n",
+      "            \"chunk_index\": 12,\n",
+      "            \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
+      "            \"token_count\": 160,\n",
+      "            \"page_number\": 9,\n",
+      "            \"metadata\": {\n",
+      "                \"filename\": \"tmpq96h17zo.pdf\",\n",
+      "                \"filetype\": \"application/pdf\",\n",
+      "                \"languages\": [\n",
+      "                    \"eng\"\n",
+      "                ],\n",
+      "                \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
+      "            },\n",
+      "            \"dense_embedding\": null\n",
+      "        },\n",
+      "        {\n",
+      "            \"id\": \"07ea8e9d-e989-4b4a-a1e9-14506b5c82ff\",\n",
       "            \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
-      "            \"document_id\": \"doc_879d9ef3-e2c1-45ef-ae2f-396065b5f18e\",\n",
+      "            \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
+      "            \"content\": \"Retrieval Accuracy Secondly, we evaluate the capabilities of each chunking strategy in terms of retrieval accuracy. We use the page numbers in the ground truth to calculate the page-level retrieval accuracy, and we use ROGUE [24] and BLEU [32] scores to evaluate the accuracy of paragraph-level retrieval compared to the ground truth evidence paragraphs. As shown in Table 4, when compared to Unstructured element-based chunk- ing strategies, basic chunking strategies seem to have higher page-level retrieval accuracy but lower paragraph-level accuracy on average. Additionally, basic chunking strategies also lack consistency between page-level and paragraph-level accuracy; higher page-level accuracy doesn\\u2019t ensure higher paragraph-level ac- curacy. For example, Base 128 has the second highest page-level accuracy but\",\n",
+      "            \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
+      "            \"source_type\": \".pdf\",\n",
+      "            \"chunk_index\": 12,\n",
+      "            \"title\": \"Table 3. Chunks statistics for basic chunking elements and Unstructured elements\",\n",
+      "            \"token_count\": 154,\n",
+      "            \"page_number\": 9,\n",
+      "            \"metadata\": {\n",
+      "                \"filename\": \"tmpq96h17zo.pdf\",\n",
+      "                \"filetype\": \"application/pdf\",\n",
+      "                \"languages\": [\n",
+      "                    \"eng\"\n",
+      "                ],\n",
+      "                \"parent_id\": \"53ffedc9520f52ef2c8e4568301c8530\"\n",
+      "            },\n",
+      "            \"dense_embedding\": null\n",
+      "        },\n",
+      "        {\n",
+      "            \"id\": \"d8eca566-f0ab-4a36-a674-3f0b99bf3807\",\n",
+      "            \"doc_url\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
+      "            \"document_id\": \"doc_a4f19222-92af-40ae-ae82-bf7aa0f29bd7\",\n",
       "            \"content\": \"Chunking strategy Base 128 Base 256 Base 512 Keywords Chipper Summary Chipper Pre\\ufb01x & Table Description Chipper Furthermore, we would like to study the impact of RAG con\\ufb01guration and ele- meant type based chunking.\",\n",
       "            \"source\": \"https://arxiv.org/pdf/2402.05131.pdf\",\n",
       "            \"source_type\": \".pdf\",\n",
@@ -161,7 +192,7 @@
       "            \"token_count\": 53,\n",
       "            \"page_number\": 1,\n",
       "            \"metadata\": {\n",
-      "                \"filename\": \"tmpsaw15pz9.pdf\",\n",
+      "                \"filename\": \"tmpq96h17zo.pdf\",\n",
       "                \"filetype\": \"application/pdf\",\n",
       "                \"languages\": [\n",
       "                    \"eng\"\n",
@@ -199,6 +230,7 @@
     "query_response = requests.post(query_url, json=query_payload)\n",
     "\n",
     "# NOTE: Filter out fields before given to LLM\n",
+    "# Include title, content, source, page_number, chunk_index\n",
     "formatted_json = json.dumps(query_response.json(), indent=4)\n",
     "print(formatted_json)"
    ]
@@ -285,9 +317,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+     "{'detail': [{'type': 'missing', 'loc': ['body', 'files'], 'msg': 'Field required', 'input': {'file_url': 'https://arxiv.org/pdf/2210.03629.pdf', 'vector_database': {'type': 'pinecone', 'config': {'api_key': '<REDACTED>', 'host': 'https://simonas-serverless-1536-75c816a.svc.apw5-4e34-81fa.pinecone.io'}}, 'index_name': 'simonas-serverless-1536', 'encoder': 'openai'}, 'url': 'https://errors.pydantic.dev/2.6/v/missing'}, {'type': 'model_attributes_type', 'loc': ['body', 'encoder'], 'msg': 'Input should be a valid dictionary or object to extract fields from', 'input': 'openai', 'url': 'https://errors.pydantic.dev/2.6/v/model_attributes_type'}]}\n"
+     ]
+    }
+   ],
    "source": [
     "# Delete the index\n",
     "query_url = f\"{API_URL}/api/v1/delete\"\n",
@@ -302,7 +342,7 @@
     "        }\n",
     "    },\n",
     "    \"index_name\": PINECONE_INDEX,\n",
-    "    \"encoder\": \"cohere\",\n",
+    "    \"encoder\": \"openai\",\n",
     "}\n",
     "\n",
     "delete_response = requests.delete(query_url, json=delete_payload)\n",